if __name__ == '__main__':
    # Command-line options select which LAVIS checkpoint the demo serves.
    parser = argparse.ArgumentParser(description="Demo")
    parser.add_argument("--model-name", default="blip2_vicuna_instruct")
    parser.add_argument("--model-type", default="vicuna7b")
    args = parser.parse_args()

    # ------------------------------------------------------------------
    # Gradio input widgets. Their order in the ``inputs=[...]`` list at the
    # bottom must match the positional parameters of ``inference``.
    # ------------------------------------------------------------------
    image_input = gr.Image(type="pil")

    min_len = gr.Slider(
        minimum=1,
        maximum=50,
        value=1,
        step=1,
        interactive=True,
        label="Min Length",
    )

    max_len = gr.Slider(
        minimum=10,
        maximum=500,
        value=250,
        step=5,
        interactive=True,
        label="Max Length",
    )

    sampling = gr.Radio(
        choices=["Beam search", "Nucleus sampling"],
        value="Beam search",
        label="Text Decoding Method",
        interactive=True,
    )

    top_p = gr.Slider(
        minimum=0.5,
        maximum=1.0,
        value=0.9,
        step=0.1,
        interactive=True,
        label="Top p",
    )

    beam_size = gr.Slider(
        minimum=1,
        maximum=10,
        value=5,
        step=1,
        interactive=True,
        label="Beam Size",
    )

    len_penalty = gr.Slider(
        minimum=-1,
        maximum=2,
        value=1,
        step=0.2,
        interactive=True,
        label="Length Penalty",
    )

    # NOTE(review): the slider allows values <= 0; confirm the model's
    # ``generate`` accepts a non-positive repetition penalty.
    repetition_penalty = gr.Slider(
        minimum=-1,
        maximum=3,
        value=1,
        step=0.2,
        interactive=True,
        label="Repetition Penalty",
    )

    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

    print('Loading model...')

    # The third return value (text processors) is not needed by this demo.
    model, vis_processors, _ = load_model_and_preprocess(
        name=args.model_name,
        model_type=args.model_type,
        is_eval=True,
        device=device,
    )

    print('Loading model done!')

    def inference(image, min_len, max_len, beam_size, len_penalty,
                  repetition_penalty, top_p, decoding_method):
        """Generate a description of ``image`` using a fixed prompt.

        The parameters map one-to-one, in order, onto the eight widgets in
        the ``gr.Interface`` ``inputs`` list below. The previous signature
        had a ninth, unused ``modeltype`` parameter with no matching widget,
        so every call failed with a missing-argument ``TypeError``.

        Args:
            image: PIL image from the image widget.
            min_len: minimum generation length.
            max_len: maximum generation length.
            beam_size: number of beams.
            len_penalty: length penalty passed to ``model.generate``.
            repetition_penalty: repetition penalty passed to ``model.generate``.
            top_p: nucleus-sampling probability mass.
            decoding_method: ``"Beam search"`` or ``"Nucleus sampling"``.

        Returns:
            The first element of whatever ``model.generate`` returns.
        """
        use_nucleus_sampling = decoding_method == "Nucleus sampling"

        # Preprocess, add a leading batch axis, and move to the model device.
        image = vis_processors["eval"](image).unsqueeze(0).to(device)

        samples = {
            "image": image,
            "prompt": "Describe the image in detail and where are the violence objects position in the image (center, left, right, top, bottom)."
        }

        # Slider values can arrive as floats; count-like arguments are cast
        # to int so the decoder always receives integers.
        output = model.generate(
            samples,
            length_penalty=float(len_penalty),
            repetition_penalty=float(repetition_penalty),
            num_beams=int(beam_size),
            max_length=int(max_len),
            min_length=int(min_len),
            top_p=float(top_p),
            use_nucleus_sampling=use_nucleus_sampling,
        )

        return output[0]

    gr.Interface(
        fn=inference,
        inputs=[image_input, min_len, max_len, beam_size, len_penalty, repetition_penalty, top_p, sampling],
        outputs="text",
        allow_flagging="never",
    ).launch()
# Absolute path of the directory that contains this ``__init__.py``.
root_dir = os.path.dirname(os.path.abspath(__file__))
# Package defaults, read from ``configs/default.yaml`` under ``root_dir``.
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))

# Publish well-known locations through the shared registry.
registry.register_path("library_root", root_dir)
# Parent of the package directory; the path is not normalised, so it still
# ends in "..".
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
# Cache location: ``env.cache_root`` from the default config joined onto
# ``repo_root``.
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)

# Shared constants looked up by name elsewhere.
registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
class CannyDetector:
    """Stateless callable that forwards to ``cv2.Canny``."""

    def __call__(self, img, low_threshold, high_threshold):
        """Return ``cv2.Canny(img, low_threshold, high_threshold)``.

        Args:
            img: image handed to ``cv2.Canny`` unchanged.
            low_threshold: first threshold argument of ``cv2.Canny``.
            high_threshold: second threshold argument of ``cv2.Canny``.

        Returns:
            Whatever ``cv2.Canny`` returns for these arguments.
        """
        edges = cv2.Canny(img, low_threshold, high_threshold)
        return edges
class Network(torch.nn.Module):
    """Edge-detection network: five VGG-style stages with fused side outputs.

    Each stage feeds a 1x1 "score" convolution. The five score maps are
    resized to the input resolution, concatenated, and merged by a final
    1x1 convolution followed by a sigmoid.
    """

    def __init__(self, model_path):
        """Build the layers and load weights from ``model_path``.

        Args:
            model_path: path to a checkpoint readable by ``torch.load`` that
                holds a state dict. Every ``'module'`` substring in its keys
                is rewritten to ``'net'`` to match this class's attributes.
        """
        super().__init__()

        # Layer order inside each Sequential is fixed: checkpoint keys
        # address layers by index (e.g. ``netVggTwo.1.weight``).
        self.netVggOne = self._vgg_stage(3, 64, num_convs=2, pool=False)
        self.netVggTwo = self._vgg_stage(64, 128, num_convs=2)
        self.netVggThr = self._vgg_stage(128, 256, num_convs=3)
        self.netVggFou = self._vgg_stage(256, 512, num_convs=3)
        self.netVggFiv = self._vgg_stage(512, 512, num_convs=3)

        self.netScoreOne = torch.nn.Conv2d(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreTwo = torch.nn.Conv2d(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreThr = torch.nn.Conv2d(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreFou = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreFiv = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)

        self.netCombine = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0),
            torch.nn.Sigmoid()
        )

        # Load onto the CPU first so a checkpoint saved from a CUDA device
        # can be read on a CPU-only host. Callers move the module afterwards.
        state = torch.load(model_path, map_location="cpu")
        self.load_state_dict({strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in state.items()})

    @staticmethod
    def _vgg_stage(in_channels, out_channels, num_convs, pool=True):
        """Return ``[MaxPool2d] + num_convs * [Conv2d 3x3, ReLU]`` as a Sequential."""
        layers = [torch.nn.MaxPool2d(kernel_size=2, stride=2)] if pool else []
        for _ in range(num_convs):
            layers.append(torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1))
            layers.append(torch.nn.ReLU(inplace=False))
            in_channels = out_channels  # later convs in the stage keep the width
        return torch.nn.Sequential(*layers)

    def forward(self, tenInput):
        """Return the fused edge map for ``tenInput``.

        Args:
            tenInput: 4-D tensor with 3 channels on axis 1. It is multiplied
                by 255 here, so presumably it is expected in [0, 1]; confirm
                against the caller.

        Returns:
            Single-channel tensor with the input's spatial size and values
            in [0, 1] (sigmoid output).
        """
        tenInput = tenInput * 255.0
        # Per-channel offsets subtracted after rescaling.
        # NOTE(review): these look like BGR dataset means; confirm.
        tenInput = tenInput - torch.tensor(data=[104.00698793, 116.66876762, 122.67891434], dtype=tenInput.dtype, device=tenInput.device).view(1, 3, 1, 1)

        tenVggOne = self.netVggOne(tenInput)
        tenVggTwo = self.netVggTwo(tenVggOne)
        tenVggThr = self.netVggThr(tenVggTwo)
        tenVggFou = self.netVggFou(tenVggThr)
        tenVggFiv = self.netVggFiv(tenVggFou)

        tenScores = [
            self.netScoreOne(tenVggOne),
            self.netScoreTwo(tenVggTwo),
            self.netScoreThr(tenVggThr),
            self.netScoreFou(tenVggFou),
            self.netScoreFiv(tenVggFiv),
        ]

        # Bring every side output back to the input resolution before fusing.
        size = (tenInput.shape[2], tenInput.shape[3])
        tenScores = [
            torch.nn.functional.interpolate(input=tenScore, size=size, mode='bilinear', align_corners=False)
            for tenScore in tenScores
        ]

        return self.netCombine(torch.cat(tenScores, 1))
def nms(x, t, s):
    """Thin a response map to directional local maxima and binarise it.

    Args:
        x: input array; converted to float32 before blurring.
        t: threshold; only surviving responses strictly above it are kept.
        s: sigma handed to ``cv2.GaussianBlur`` (kernel size derived from it).

    Returns:
        uint8 array shaped like the blurred map, 255 where a pixel equals
        the dilation of the blurred map along at least one of four 3x3 line
        kernels and exceeds ``t``, 0 elsewhere.
    """
    blurred = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    # Line-shaped structuring elements: horizontal, vertical, two diagonals.
    kernels = (
        np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8),
        np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8),
        np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8),
        np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8),
    )

    peaks = np.zeros_like(blurred)
    for kernel in kernels:
        # A pixel that equals its dilation is the maximum along that line.
        is_line_max = cv2.dilate(blurred, kernel=kernel) == blurred
        np.putmask(peaks, is_line_max, blurred)

    binary = np.zeros_like(peaks, dtype=np.uint8)
    binary[peaks > t] = 255
    return binary
def load_midas_transform(model_type):
    """Build the preprocessing pipeline for ``model_type`` without a network.

    Based on https://github.com/isl-org/MiDaS/blob/master/run.py.

    Args:
        model_type: one of ``"dpt_large"``, ``"dpt_hybrid"``, ``"midas_v21"``
            or ``"midas_v21_small"``.

    Returns:
        A ``Compose`` of ``Resize`` -> ``NormalizeImage`` -> ``PrepareForNet``.

    Raises:
        AssertionError: if ``model_type`` is not one of the supported names.
    """
    half_range = ([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    imagenet = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # model_type -> (square network input size, resize method, (mean, std))
    presets = {
        "dpt_large": (384, "minimal", half_range),
        "dpt_hybrid": (384, "minimal", half_range),
        "midas_v21": (384, "upper_bound", imagenet),
        "midas_v21_small": (256, "upper_bound", imagenet),
    }

    if model_type not in presets:
        # Raised explicitly: an ``assert`` statement is stripped under
        # ``python -O`` and would let execution continue with unbound
        # locals. The exception type is unchanged for existing callers.
        raise AssertionError(f"model_type '{model_type}' not implemented, use: --model_type large")

    net_size, resize_mode, (mean, std) = presets[model_type]
    normalization = NormalizeImage(mean=mean, std=std)

    transform = Compose(
        [
            Resize(
                net_size,
                net_size,
                resize_target=None,
                keep_aspect_ratio=True,
                ensure_multiple_of=32,
                resize_method=resize_mode,
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            normalization,
            PrepareForNet(),
        ]
    )

    return transform
class MiDaSInference(nn.Module):
    """Inference-only wrapper around a network built by ``load_model``."""

    # Model names grouped by origin; only the ISL list is checked here.
    MODEL_TYPES_TORCH_HUB = [
        "DPT_Large",
        "DPT_Hybrid",
        "MiDaS_small"
    ]
    MODEL_TYPES_ISL = [
        "dpt_large",
        "dpt_hybrid",
        "midas_v21",
        "midas_v21_small",
    ]

    def __init__(self, model_type):
        """Load the network for ``model_type`` and freeze its train/eval mode.

        Args:
            model_type: must be one of ``MODEL_TYPES_ISL``.
        """
        # Function-scope import keeps this block self-contained.
        import functools

        super().__init__()
        assert (model_type in self.MODEL_TYPES_ISL)
        # load_model also returns a transform; it is not used here.
        model, _ = load_model(model_type)
        self.model = model
        # Replace ``train`` with a no-op so the mode set by load_model never
        # changes. A function stored on an instance is not bound, so the
        # previous plain assignment made ``model.train()`` raise TypeError
        # and ``model.eval()`` return ``False``. Binding the module as the
        # first argument restores the normal contract of returning the
        # module.
        self.model.train = functools.partial(disabled_train, self.model)

    def forward(self, x):
        """Run the wrapped model on ``x`` with gradient tracking disabled."""
        with torch.no_grad():
            prediction = self.model(x)
        return prediction
+ + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if "optimizer" in parameters: + parameters = parameters["model"] + + self.load_state_dict(parameters) diff --git a/lavis/common/annotator/midas/midas/blocks.py b/lavis/common/annotator/midas/midas/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..2145d18fa98060a618536d9a64fe6589e9be4f78 --- /dev/null +++ b/lavis/common/annotator/midas/midas/blocks.py @@ -0,0 +1,342 @@ +import torch +import torch.nn as nn + +from .vit import ( + _make_pretrained_vitb_rn50_384, + _make_pretrained_vitl16_384, + _make_pretrained_vitb16_384, + forward_vit, +) + +def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",): + if backbone == "vitl16_384": + pretrained = _make_pretrained_vitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # ViT-L/16 - 85.0% Top1 (backbone) + elif backbone == "vitb_rn50_384": + pretrained = _make_pretrained_vitb_rn50_384( + use_pretrained, + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) + scratch = _make_scratch( + [256, 512, 768, 768], features, groups=groups, expand=expand + ) # ViT-H/16 - 85.0% Top1 (backbone) + elif backbone == "vitb16_384": + pretrained = _make_pretrained_vitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # ViT-B/16 - 84.6% Top1 (backbone) + elif backbone == "resnext101_wsl": + pretrained = _make_pretrained_resnext101_wsl(use_pretrained) + scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 + elif backbone == "efficientnet_lite3": + pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, 
exportable=exportable) + scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3 + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand==True: + out_shape1 = out_shape + out_shape2 = out_shape*2 + out_shape3 = out_shape*4 + out_shape4 = out_shape*8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + + return scratch + + +def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): + efficientnet = torch.hub.load( + "rwightman/gen-efficientnet-pytorch", + "tf_efficientnet_lite3", + pretrained=use_pretrained, + exportable=exportable + ) + return _make_efficientnet_backbone(efficientnet) + + +def _make_efficientnet_backbone(effnet): + pretrained = nn.Module() + + pretrained.layer1 = nn.Sequential( + effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] + ) + pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) + pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) + pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) + + return pretrained + + +def _make_resnet_backbone(resnet): + pretrained = nn.Module() + pretrained.layer1 = nn.Sequential( + resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 + ) + + pretrained.layer2 = resnet.layer2 + pretrained.layer3 = resnet.layer3 + 
class ResidualConvUnit(nn.Module):
    """Two 3x3 convolutions, each preceded by ReLU, plus a skip connection."""

    def __init__(self, features):
        """Create the unit.

        Args:
            features (int): channel count, kept unchanged by both convolutions.
        """
        super().__init__()

        conv_kwargs = dict(kernel_size=3, stride=1, padding=1, bias=True)
        self.conv1 = nn.Conv2d(features, features, **conv_kwargs)
        self.conv2 = nn.Conv2d(features, features, **conv_kwargs)

        # In-place activation: the first call in ``forward`` overwrites the
        # input tensor itself.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply the residual unit.

        Args:
            x (tensor): input; rectified in place as a side effect.

        Returns:
            tensor: ``conv2(relu(conv1(relu(x)))) + relu(x)``. Because the
            ReLU is in place, the skip path carries the rectified input.
        """
        residual = self.conv1(self.relu(x))
        residual = self.conv2(self.relu(residual))
        return residual + x
class ResidualConvUnit_custom(nn.Module):
    """Residual unit with a configurable activation and optional batch norm."""

    def __init__(self, features, activation, bn):
        """Create the unit.

        Args:
            features (int): channel count, kept unchanged by both convolutions.
            activation: callable applied before each convolution.
            bn (bool): when true, a ``BatchNorm2d`` follows each convolution.
        """
        super().__init__()

        self.bn = bn

        self.groups = 1

        self.conv1 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
        )

        self.conv2 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
        )

        if self.bn:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)

        self.activation = activation

        # The skip addition goes through a module instead of ``+``.
        # NOTE(review): presumably for quantization support; confirm.
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Apply the residual unit.

        Args:
            x (tensor): input

        Returns:
            tensor: ``x`` plus the result of the two activation -> conv
            (-> batch norm) steps.
        """
        out = self.activation(x)
        out = self.conv1(out)
        if self.bn:
            out = self.bn1(out)

        out = self.activation(out)
        out = self.conv2(out)
        if self.bn:
            out = self.bn2(out)

        # The former ``if self.groups > 1: out = self.conv_merge(out)`` branch
        # was removed: ``groups`` is fixed at 1 and ``conv_merge`` is never
        # created, so the branch could only ever raise AttributeError.
        return self.skip_add.add(out, x)
class DPT(BaseModel):
    """Transformer-backbone encoder, four fusion blocks and a pluggable head."""

    def __init__(
        self,
        head,
        features=256,
        backbone="vitb_rn50_384",
        readout="project",
        channels_last=False,
        use_bn=False,
    ):
        """Assemble the network.

        Args:
            head: module stored as ``scratch.output_conv`` and applied to the
                last fusion output.
            features: channel width of the scratch convolutions and fusion
                blocks.
            backbone: key into the hook table below; also passed to
                ``_make_encoder``.
            readout: passed to ``_make_encoder`` as ``use_readout``.
            channels_last: when true, the input is converted to the
                ``torch.channels_last`` memory format in ``forward``.
            use_bn: whether the fusion blocks use batch normalisation.

        Raises:
            KeyError: if ``backbone`` has no entry in the hook table.
        """
        super(DPT, self).__init__()

        self.channels_last = channels_last

        # Hook indices handed to the backbone factory, one list per backbone.
        hooks = {
            "vitb_rn50_384": [0, 1, 8, 11],
            "vitb16_384": [2, 5, 8, 11],
            "vitl16_384": [5, 11, 17, 23],
        }

        # Instantiate backbone and reassemble blocks
        self.pretrained, self.scratch = _make_encoder(
            backbone,
            features,
            False,  # Set to true if you want to train from scratch, uses ImageNet weights
            groups=1,
            expand=False,
            exportable=False,
            hooks=hooks[backbone],
            use_readout=readout,
        )

        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

        self.scratch.output_conv = head

    def forward(self, x):
        """Run backbone, reassembly, top-down fusion and the head on ``x``.

        Returns:
            The output of ``scratch.output_conv`` on the final fused features.
        """
        if self.channels_last:
            # ``Tensor.contiguous`` returns a new tensor. The result used to
            # be discarded, so the memory-format request had no effect.
            x = x.contiguous(memory_format=torch.channels_last)

        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Fuse from the deepest feature map up to the shallowest.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return out
class MidasNet(BaseModel):
    """Network for monocular depth estimation.

    Pairs an encoder built by ``_make_encoder`` (``self.pretrained``) with four
    ``FeatureFusionBlock`` stages and a convolutional output head, all stored
    on ``self.scratch``.
    """

    def __init__(self, path=None, features=256, non_negative=True):
        """Init.

        Args:
            path (str, optional): Path to saved model. Defaults to None.
            features (int, optional): Number of features. Defaults to 256.
            non_negative (bool, optional): If true the head ends with a ReLU,
                otherwise with an Identity. Defaults to True.
        """
        print("Loading weights: ", path)

        super(MidasNet, self).__init__()

        # Pretrained encoder weights are requested only when a checkpoint path is given.
        use_pretrained = False if path is None else True

        # The backbone name is fixed here; it is not a constructor argument.
        self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)

        self.scratch.refinenet4 = FeatureFusionBlock(features)
        self.scratch.refinenet3 = FeatureFusionBlock(features)
        self.scratch.refinenet2 = FeatureFusionBlock(features)
        self.scratch.refinenet1 = FeatureFusionBlock(features)

        # Output head: reduces to a single channel, with one 2x bilinear upsampling in between.
        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
        )

        # Truthiness test: an empty-string path is not loaded, although it
        # still set use_pretrained above (that check is `is None`).
        if path:
            self.load(path)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input data (image)

        Returns:
            tensor: depth (head output with dimension 1 squeezed)
        """

        # Encoder stages are chained: each one consumes the previous stage's output.
        layer_1 = self.pretrained.layer1(x)
        layer_2 = self.pretrained.layer2(layer_1)
        layer_3 = self.pretrained.layer3(layer_2)
        layer_4 = self.pretrained.layer4(layer_3)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Fusion runs from the last stage back to the first.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return torch.squeeze(out, dim=1)
class MidasNet_small(BaseModel):
    """Network for monocular depth estimation.

    Variant whose encoder backbone is selectable and whose fusion blocks can
    use per-stage feature widths (``blocks['expand']``).
    """

    def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
        blocks={'expand': True}):
        """Init.

        Args:
            path (str, optional): Path to saved model. Defaults to None.
            features (int, optional): Number of features. Defaults to 64.
            backbone (str, optional): Backbone network for encoder.
                Defaults to "efficientnet_lite3".
            non_negative (bool, optional): If true the head ends with a ReLU,
                otherwise with an Identity. Defaults to True.
            exportable (bool, optional): Forwarded to ``_make_encoder``. Defaults to True.
            channels_last (bool, optional): Convert the input to channels-last
                memory format in ``forward``. Defaults to False.
            align_corners (bool, optional): Forwarded to the fusion blocks. Defaults to True.
            blocks (dict, optional): Options; ``{'expand': True}`` widens the
                deeper fusion blocks to 2x/4x/8x ``features``. The default dict
                is shared between instances and is only read, never mutated, here.
        """
        print("Loading weights: ", path)

        super(MidasNet_small, self).__init__()

        # Pretrained encoder weights are requested only when no checkpoint path is given.
        use_pretrained = False if path else True

        self.channels_last = channels_last
        self.blocks = blocks
        self.backbone = backbone

        self.groups = 1

        features1 = features
        features2 = features
        features3 = features
        features4 = features
        self.expand = False
        if "expand" in self.blocks and self.blocks['expand'] == True:
            self.expand = True
            features1 = features
            features2 = features * 2
            features3 = features * 4
            features4 = features * 8

        self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)

        self.scratch.activation = nn.ReLU(False)

        self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        # The last block is built without `expand`, so it keeps `features1` channels for the head.
        self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)

        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
            self.scratch.activation,
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
            nn.Identity(),
        )

        if path:
            self.load(path)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input data (image)

        Returns:
            tensor: depth (head output with dimension 1 squeezed)
        """
        if self.channels_last == True:
            print("self.channels_last = ", self.channels_last)
            # Tensor.contiguous() returns the converted tensor; without the
            # assignment the requested memory format was never applied.
            x = x.contiguous(memory_format=torch.channels_last)

        # Encoder stages are chained: each one consumes the previous stage's output.
        layer_1 = self.pretrained.layer1(x)
        layer_2 = self.pretrained.layer2(layer_1)
        layer_3 = self.pretrained.layer3(layer_2)
        layer_4 = self.pretrained.layer4(layer_3)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Fusion runs from the last stage back to the first.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return torch.squeeze(out, dim=1)
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
    """Upscale a sample in place until it reaches the given size. Keeps aspect ratio.

    The reference shape is taken from ``sample["disparity"]``. When an upscale
    is needed, ``"image"``, ``"disparity"`` and ``"mask"`` are all resized and
    the mask is converted back to bool.

    Args:
        sample (dict): sample with "image", "disparity" and "mask" arrays
        size (tuple): minimum size, compared index-wise with the disparity shape
        image_interpolation_method: cv2 interpolation flag used for the image

    Returns:
        dict | tuple: the unchanged ``sample`` when it is already large enough,
        otherwise the new shape as a tuple.
    """
    dims = list(sample["disparity"].shape)

    already_big_enough = dims[0] >= size[0] and dims[1] >= size[1]
    if already_big_enough:
        return sample

    # One common factor: the larger of the two per-axis ratios.
    factor = max([size[0] / dims[0], size[1] / dims[1]])

    dims[0] = math.ceil(factor * dims[0])
    dims[1] = math.ceil(factor * dims[1])

    # cv2.resize takes the target size in reversed order relative to the array shape.
    target = tuple(dims[::-1])

    sample["image"] = cv2.resize(
        sample["image"], target, interpolation=image_interpolation_method
    )
    sample["disparity"] = cv2.resize(
        sample["disparity"], target, interpolation=cv2.INTER_NEAREST
    )

    mask_as_float = sample["mask"].astype(np.float32)
    sample["mask"] = cv2.resize(
        mask_as_float, target, interpolation=cv2.INTER_NEAREST
    )
    sample["mask"] = sample["mask"].astype(bool)

    return tuple(dims)
(Output size might be smaller than given size.) + Defaults to "lower_bound". + """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + 
class NormalizeImage(object):
    """Normalize the ``"image"`` entry of a sample with a given mean and std."""

    def __init__(self, mean, std):
        """Store the mean to subtract and the std to divide by."""
        self.__mean, self.__std = mean, std

    def __call__(self, sample):
        """Replace ``sample["image"]`` with ``(image - mean) / std`` and return the sample."""
        centred = sample["image"] - self.__mean
        sample["image"] = centred / self.__std
        return sample
class AddReadout(nn.Module):
    """Add the leading readout token(s) onto every remaining token.

    With ``start_index == 2`` the first two tokens are averaged to form the
    readout; otherwise only token 0 is used. The returned tensor drops the
    first ``start_index`` tokens along dimension 1.
    """

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        first = self.start_index
        token = (x[:, 0] + x[:, 1]) / 2 if first == 2 else x[:, 0]
        remaining = x[:, first:]
        # unsqueeze(1) lets the readout broadcast over the token dimension.
        return remaining + token.unsqueeze(1)
dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + glob = pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + layer_4 = pretrained.activations["4"] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size( + [ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ] + ), + ) + ) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1) + layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2) + layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3) + layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, : self.start_index], + posemb[0, self.start_index :], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + 
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Build one readout-handling module per entry of ``features``.

    Args:
        vit_features (int): feature size passed to ``ProjectReadout``
        features (list): only its length is used (one operator per entry)
        use_readout (str): "ignore" (``Slice``), "add" (``AddReadout``) or
            "project" (``ProjectReadout``)
        start_index (int, optional): forwarded to the operator. Defaults to 1.

    Returns:
        list: readout operators. For "ignore" and "add" the list repeats one
        shared module instance; for "project" every entry is a separate module.

    Raises:
        ValueError: if ``use_readout`` is not one of the supported modes.
    """
    if use_readout == "ignore":
        readout_oper = [Slice(start_index)] * len(features)
    elif use_readout == "add":
        readout_oper = [AddReadout(start_index)] * len(features)
    elif use_readout == "project":
        # ProjectReadout carries its own Linear layer, so each entry gets a fresh instance.
        readout_oper = [
            ProjectReadout(vit_features, start_index) for _ in features
        ]
    else:
        # Explicit raise: an `assert` would be stripped under `python -O`
        # and the function would then fail on an unbound local instead.
        raise ValueError(
            "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
        )

    return readout_oper
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can 
def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
    """Create the timm "vit_large_patch16_384" model and wrap it as a backbone.

    Args:
        pretrained (bool): forwarded to ``timm.create_model``
        use_readout (str, optional): readout mode. Defaults to "ignore".
        hooks (list, optional): block indices to tap. Defaults to [5, 11, 17, 23].

    Returns:
        the module returned by ``_make_vit_b16_backbone``
    """
    model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)

    # Identity check: `== None` would dispatch to the argument's __eq__.
    hooks = [5, 11, 17, 23] if hooks is None else hooks
    return _make_vit_b16_backbone(
        model,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
    )
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + else: + pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( + get_activation("1") + ) + pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( + get_activation("2") + ) + + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) + + if use_vit_only == True: + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + else: + pretrained.act_postprocess1 = nn.Sequential( + nn.Identity(), nn.Identity(), nn.Identity() + ) + pretrained.act_postprocess2 = nn.Sequential( + nn.Identity(), nn.Identity(), nn.Identity() + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + 
def _make_pretrained_vitb_rn50_384(
    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
):
    """Create the timm "vit_base_resnet50_384" model and wrap it as a backbone.

    Args:
        pretrained (bool): forwarded to ``timm.create_model``
        use_readout (str, optional): readout mode. Defaults to "ignore".
        hooks (list, optional): block indices to tap. Defaults to [0, 1, 8, 11].
        use_vit_only (bool, optional): forwarded to ``_make_vit_b_rn50_backbone``.
            Defaults to False.

    Returns:
        the module returned by ``_make_vit_b_rn50_backbone``
    """
    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)

    # Identity check: `== None` would dispatch to the argument's __eq__.
    hooks = [0, 1, 8, 11] if hooks is None else hooks
    return _make_vit_b_rn50_backbone(
        model,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )
def write_pfm(path, image, scale=1):
    """Write pfm file.

    The header is "PF" for H x W x 3 images and "Pf" for H x W or H x W x 1
    images, followed by "width height" and the scale. The scale is written
    negated when the data is little-endian. Rows are written vertically flipped.

    The input is validated before the file is opened, so an invalid image no
    longer leaves an empty file behind.

    Args:
        path (str): path to file
        image (array): float32 data
        scale (int, optional): Scale. Defaults to 1.

    Raises:
        ValueError: if the dtype is not float32 or the shape is unsupported.
    """
    if image.dtype.name != "float32":
        raise ValueError("Image dtype must be float32.")

    ndim = len(image.shape)
    if ndim == 3 and image.shape[2] == 3:  # color image
        color = True
    elif ndim == 2 or (ndim == 3 and image.shape[2] == 1):  # greyscale
        color = False
    else:
        raise ValueError("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    image = np.flipud(image)

    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(path, "wb") as file:
        # Bytes literals throughout: the previous `"PF\n" if color else
        # "Pf\n".encode()` encoded only the greyscale branch and raised
        # TypeError for colour images.
        file.write(b"PF\n" if color else b"Pf\n")
        file.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
        file.write(b"%f\n" % scale)

        image.tofile(file)
def write_depth(path, depth, bits=1):
    """Write depth map to pfm and png file.

    The pfm file holds the float32 depth values. The png holds the depth
    rescaled linearly to the full range of the chosen integer width; a depth
    map whose value range is within float epsilon is written as all zeros.

    Args:
        path (str): filepath without extension
        depth (array): depth
        bits (int, optional): bytes per png sample: 1 writes uint8, 2 writes
            uint16. Any other value writes no png. Defaults to 1.
    """
    write_pfm(path + ".pfm", depth.astype(np.float32))

    depth_min = depth.min()
    depth_max = depth.max()

    max_val = (2 ** (8 * bits)) - 1

    if depth_max - depth_min > np.finfo("float").eps:
        out = max_val * (depth - depth_min) / (depth_max - depth_min)
    else:
        # ndarray exposes `dtype`, not `type`; the old attribute access raised
        # AttributeError for constant depth maps.
        out = np.zeros(depth.shape, dtype=depth.dtype)

    if bits == 1:
        cv2.imwrite(path + ".png", out.astype("uint8"))
    elif bits == 2:
        cv2.imwrite(path + ".png", out.astype("uint16"))

    return
class BlockTypeB(nn.Module):
    """Two 3x3 conv/BN/ReLU stages, the first wrapped in a residual add.

    ``conv1`` keeps the channel count (``in_c`` -> ``in_c``) so its output can
    be summed with the input; ``conv2`` then maps ``in_c`` -> ``out_c``.
    Both convolutions use ``padding=1``, so spatial size is unchanged.
    """

    def __init__(self, in_c, out_c):
        super(BlockTypeB, self).__init__()
        # Attribute names and layer order are kept so state_dict keys
        # (conv1.*, conv2.*) stay the same.
        residual_stage = [
            nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
            nn.BatchNorm2d(in_c),
            nn.ReLU(),
        ]
        self.conv1 = nn.Sequential(*residual_stage)
        projection_stage = [
            nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_c),
            nn.ReLU(),
        ]
        self.conv2 = nn.Sequential(*projection_stage)

    def forward(self, x):
        """Return ``conv2(conv1(x) + x)``."""
        refined = self.conv1(x) + x
        return self.conv2(refined)
class ConvBNReLU(nn.Sequential):
    """Conv2d (no bias) -> BatchNorm2d -> ReLU6, with TFLite-style stride-2 padding.

    For ``stride == 2`` the convolution itself uses no padding; instead
    ``forward`` zero-pads one pixel on the right and bottom only. For every
    other stride the usual symmetric ``(kernel_size - 1) // 2`` padding is
    applied by the convolution.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
        self.channel_pad = out_planes - in_planes
        self.stride = stride

        # TFLite uses slightly different padding than PyTorch
        padding = 0 if stride == 2 else (kernel_size - 1) // 2

        conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False)
        super(ConvBNReLU, self).__init__(
            conv,
            nn.BatchNorm2d(out_planes),
            nn.ReLU6(inplace=True),
        )
        # Registered as a child module but never applied; forward() skips it.
        self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)

    def forward(self, x):
        """Apply the asymmetric pad (stride 2 only), then conv, BN and ReLU6."""
        # TFLite uses different padding
        if self.stride == 2:
            x = F.pad(x, (0, 1, 0, 1), "constant", 0)

        for layer in self:
            if isinstance(layer, nn.MaxPool2d):
                continue
            x = layer(x)
        return x
class MobileNetV2(nn.Module):
    """Truncated MobileNetV2 backbone that returns five intermediate feature maps.

    The network is a stem ``ConvBNReLU`` taking 4 input channels, followed by
    the inverted-residual stages listed in ``inverted_residual_setting``.
    ``forward`` returns the outputs of the layers whose indices are in
    ``fpn_selected`` and stops once the last of them has run.
    """

    def __init__(self, pretrained=True):
        """Build the layers, initialise the weights and optionally load a checkpoint.

        Args:
            pretrained (bool): when true, download the torchvision MobileNetV2
                checkpoint and copy every tensor whose name and shape match
                this model (see ``_load_pretrained_model``).
        """
        super(MobileNetV2, self).__init__()

        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        width_mult = 1.0
        round_nearest = 8

        # The table stops after the 96-channel stage; the 160- and
        # 320-channel stages are not built in this variant.
        inverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
        ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(4, input_channel, stride=2)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                # Only the first block of a stage applies the stage stride.
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel

        self.features = nn.Sequential(*features)
        # Indices into self.features: the last layer of each stage above.
        self.fpn_selected = [1, 3, 6, 10, 13]
        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)
        if pretrained:
            self._load_pretrained_model()

    def _forward_impl(self, x):
        """Run the layers in order and collect the ``fpn_selected`` outputs."""
        # This exists since TorchScript doesn't support inheritance, so the superclass method
        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
        fpn_features = []
        for i, f in enumerate(self.features):
            if i > self.fpn_selected[-1]:
                break
            x = f(x)
            if i in self.fpn_selected:
                fpn_features.append(x)

        c1, c2, c3, c4, c5 = fpn_features
        return c1, c2, c3, c4, c5

    def forward(self, x):
        """Return the five selected feature maps ``(c1, c2, c3, c4, c5)``."""
        return self._forward_impl(x)

    @staticmethod
    def _select_compatible_weights(pretrain_dict, state_dict):
        """Keep the checkpoint tensors that match ``state_dict`` in name and shape."""
        return {
            key: value
            for key, value in pretrain_dict.items()
            if key in state_dict and value.shape == state_dict[key].shape
        }

    def _load_pretrained_model(self):
        """Copy matching tensors from the torchvision MobileNetV2 checkpoint.

        Matching on the key alone is not enough: the stem here is built with
        4 input channels, and a checkpoint tensor of a different shape under
        the same key makes ``load_state_dict`` raise a size-mismatch error.
        Such tensors are skipped and keep their freshly initialised values.
        """
        pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
        state_dict = self.state_dict()
        state_dict.update(self._select_compatible_weights(pretrain_dict, state_dict))
        self.load_state_dict(state_dict)
class BlockTypeA(nn.Module):
    """Fuse two feature maps: 1x1-project each, optionally upsample one, concatenate.

    ``conv2`` projects input ``a`` (``in_c1`` -> ``out_c1``) and ``conv1``
    projects input ``b`` (``in_c2`` -> ``out_c2``). The result has
    ``out_c1 + out_c2`` channels.

    Args:
        in_c1 (int): channels of ``a``.
        in_c2 (int): channels of ``b``.
        out_c1 (int): channels of ``a`` after projection.
        out_c2 (int): channels of ``b`` after projection.
        upscale (bool): when true (default), ``b`` is bilinearly upsampled by
            a factor of 2 before concatenation; when false, ``b`` must
            already have the spatial size of ``a``.
    """

    def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
        super(BlockTypeA, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_c2, out_c2, kernel_size=1),
            nn.BatchNorm2d(out_c2),
            nn.ReLU(inplace=True)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_c1, out_c1, kernel_size=1),
            nn.BatchNorm2d(out_c1),
            nn.ReLU(inplace=True)
        )
        self.upscale = upscale

    def forward(self, a, b):
        """Return ``cat(conv2(a), maybe_upsampled(conv1(b)))`` along the channel axis."""
        b = self.conv1(b)
        a = self.conv2(a)
        # Honour the constructor flag; it was stored but ignored before, so
        # upscale=False still upsampled and the concatenation failed.
        if self.upscale:
            b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
        return torch.cat((a, b), dim=1)
__init__(self, in_c, out_c): + super(BlockTypeC, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5), + nn.BatchNorm2d(in_c), + nn.ReLU() + ) + self.conv2 = nn.Sequential( + nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), + nn.BatchNorm2d(in_c), + nn.ReLU() + ) + self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + return x + +def _make_divisible(v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + :param v: + :param divisor: + :param min_value: + :return: + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNReLU(nn.Sequential): + def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): + self.channel_pad = out_planes - in_planes + self.stride = stride + #padding = (kernel_size - 1) // 2 + + # TFLite uses slightly different padding than PyTorch + if stride == 2: + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + super(ConvBNReLU, self).__init__( + nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), + nn.BatchNorm2d(out_planes), + nn.ReLU6(inplace=True) + ) + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + + + def forward(self, x): + # TFLite uses different padding + if self.stride == 2: + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + #print(x.shape) + + for module in self: + if not isinstance(module, nn.MaxPool2d): + x = module(x) + return x + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expand_ratio): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + # pw + layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) + layers.extend([ + # dw + ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__(self, pretrained=True): + """ + MobileNet V2 main class + Args: + num_classes (int): Number of classes + width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount + inverted_residual_setting: Network structure + round_nearest (int): Round the number of channels in each 
layer to be a multiple of this number + Set to 1 to turn off rounding + block: Module specifying inverted residual building block for mobilenet + """ + super(MobileNetV2, self).__init__() + + block = InvertedResidual + input_channel = 32 + last_channel = 1280 + width_mult = 1.0 + round_nearest = 8 + + inverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + #[6, 96, 3, 1], + #[6, 160, 3, 2], + #[6, 320, 1, 1], + ] + + # only check the first element, assuming user knows t,c,n,s are required + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: + raise ValueError("inverted_residual_setting should be non-empty " + "or a 4-element list, got {}".format(inverted_residual_setting)) + + # building first layer + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) + features = [ConvBNReLU(4, input_channel, stride=2)] + # building inverted residual blocks + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * width_mult, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append(block(input_channel, output_channel, stride, expand_ratio=t)) + input_channel = output_channel + self.features = nn.Sequential(*features) + + self.fpn_selected = [3, 6, 10] + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + #if pretrained: + # self._load_pretrained_model() + + def _forward_impl(self, x): + # This exists since TorchScript doesn't support inheritance, so the superclass method + # (this one) needs to have a name other 
than `forward` that can be accessed in a subclass + fpn_features = [] + for i, f in enumerate(self.features): + if i > self.fpn_selected[-1]: + break + x = f(x) + if i in self.fpn_selected: + fpn_features.append(x) + + c2, c3, c4 = fpn_features + return c2, c3, c4 + + + def forward(self, x): + return self._forward_impl(x) + + def _load_pretrained_model(self): + pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth') + model_dict = {} + state_dict = self.state_dict() + for k, v in pretrain_dict.items(): + if k in state_dict: + model_dict[k] = v + state_dict.update(model_dict) + self.load_state_dict(state_dict) + + +class MobileV2_MLSD_Tiny(nn.Module): + def __init__(self): + super(MobileV2_MLSD_Tiny, self).__init__() + + self.backbone = MobileNetV2(pretrained=True) + + self.block12 = BlockTypeA(in_c1= 32, in_c2= 64, + out_c1= 64, out_c2=64) + self.block13 = BlockTypeB(128, 64) + + self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64, + out_c1= 32, out_c2= 32) + self.block15 = BlockTypeB(64, 64) + + self.block16 = BlockTypeC(64, 16) + + def forward(self, x): + c2, c3, c4 = self.backbone(x) + + x = self.block12(c3, c4) + x = self.block13(x) + x = self.block14(c2, x) + x = self.block15(x) + x = self.block16(x) + x = x[:, 7:, :, :] + #print(x.shape) + x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True) + + return x \ No newline at end of file diff --git a/lavis/common/annotator/mlsd/utils.py b/lavis/common/annotator/mlsd/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ae3cf9420a33a4abae27c48ac4b90938c7d63cc3 --- /dev/null +++ b/lavis/common/annotator/mlsd/utils.py @@ -0,0 +1,580 @@ +''' +modified by lihaoweicv +pytorch version +''' + +''' +M-LSD +Copyright 2021-present NAVER Corp. 
def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5):
    '''
    Pick the strongest local maxima of the centre map.

    tpMap: tensor of shape [1, C, H, W] with
        center:       tpMap[:, 0, :, :]
        displacement: tpMap[:, 1:5, :, :]
    topk_n: maximum number of points returned; clamped to H * W so a small
        map no longer makes torch.topk raise.
    ksize: side of the max-pool window used for non-maximum suppression.

    Returns (ptss, scores, displacement) as numpy arrays:
        ptss:         [k, 2] integer (y, x) positions, best first
        scores:       [k] sigmoid centre scores (0 where suppressed)
        displacement: [H, W, 4]
    '''
    b, c, h, w = tpMap.shape
    assert b==1, 'only support bsize==1'
    displacement = tpMap[:, 1:5, :, :][0]
    center = tpMap[:, 0, :, :]
    heat = torch.sigmoid(center)
    # A cell survives only if it equals the maximum of its ksize x ksize window.
    hmax = F.max_pool2d(heat, (ksize, ksize), stride=1, padding=(ksize-1)//2)
    keep = (hmax == heat).float()
    heat = heat * keep
    heat = heat.reshape(-1, )

    k = min(topk_n, heat.numel())
    scores, indices = torch.topk(heat, k, dim=-1, largest=True)
    # Same floor-division form as pred_squares, instead of torch.floor_divide.
    yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1)
    xx = torch.fmod(indices, w).unsqueeze(-1)
    ptss = torch.cat((yy, xx), dim=-1)

    ptss = ptss.detach().cpu().numpy()
    scores = scores.detach().cpu().numpy()
    displacement = displacement.detach().cpu().numpy()
    displacement = displacement.transpose((1, 2, 0))
    return ptss, scores, displacement


def pred_lines(image, model,
               input_shape=[512, 512],
               score_thr=0.10,
               dist_thr=20.0):
    '''
    Detect line segments in `image` with `model`.

    image: array of shape [h, w, 3]; a constant fourth channel of ones is
        appended after resizing.
    model: callable taking a [1, 4, H, W] float tensor; its output is decoded
        by deccode_output_score_and_ptss.
    input_shape: [height, width] the image is resized to before inference.
    score_thr: minimum centre score for a segment to be kept.
    dist_thr: minimum start-to-end distance, measured on the output map.

    Returns an [n, 4] array of (x_start, y_start, x_end, y_end) in the
    coordinates of the original image; shape [0, 4] when nothing is detected.
    '''
    h, w, _ = image.shape
    h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]

    # cv2.resize takes (width, height).
    resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA),
                                    np.ones([input_shape[0], input_shape[1], 1])], axis=-1)

    resized_image = resized_image.transpose((2, 0, 1))
    batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
    batch_image = (batch_image / 127.5) - 1.0

    # Run on whatever device the model lives on; fall back to the previous
    # hard-coded CUDA placement when the model exposes no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda')
    batch_image = torch.from_numpy(batch_image).float().to(device)
    outputs = model(batch_image)
    pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
    start = vmap[:, :, :2]
    end = vmap[:, :, 2:]
    dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))

    segments_list = []
    for center, score in zip(pts, pts_score):
        y, x = center
        distance = dist_map[y, x]
        if score > score_thr and distance > dist_thr:
            disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
            x_start = x + disp_x_start
            y_start = y + disp_y_start
            x_end = x + disp_x_end
            y_end = y + disp_y_end
            segments_list.append([x_start, y_start, x_end, y_end])

    if not segments_list:
        # np.array([]) is 1-D, so the column indexing below raised IndexError
        # whenever no segment passed the thresholds.
        return np.zeros((0, 4))

    # The factor 2 comes from the original "256 > 512" note — presumably the
    # output map is half the input resolution; TODO confirm for other sizes.
    lines = 2 * np.array(segments_list)
    lines[:, 0] = lines[:, 0] * w_ratio
    lines[:, 1] = lines[:, 1] * h_ratio
    lines[:, 2] = lines[:, 2] * w_ratio
    lines[:, 3] = lines[:, 3] * h_ratio

    return lines
get unique lines + point = np.array([[0, 0]]) + point = point[0] + start = segments[:, :2] + end = segments[:, 2:] + diff = start - end + a = diff[:, 1] + b = -diff[:, 0] + c = a * start[:, 0] + b * start[:, 1] + + d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10) + theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi + theta[theta < 0.0] += 180 + hough = np.concatenate([d[:, None], theta[:, None]], axis=-1) + + d_quant = 1 + theta_quant = 2 + hough[:, 0] //= d_quant + hough[:, 1] //= theta_quant + _, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True) + + acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32') + idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1 + yx_indices = hough[indices, :].astype('int32') + acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts + idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices + + acc_map_np = acc_map + # acc_map = acc_map[None, :, :, None] + # + # ### fast suppression using tensorflow op + # acc_map = tf.constant(acc_map, dtype=tf.float32) + # max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map) + # acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32) + # flatten_acc_map = tf.reshape(acc_map, [1, -1]) + # topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts)) + # _, h, w, _ = acc_map.shape + # y = tf.expand_dims(topk_indices // w, axis=-1) + # x = tf.expand_dims(topk_indices % w, axis=-1) + # yx = tf.concat([y, x], axis=-1) + + ### fast suppression using pytorch op + acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0) + _,_, h, w = acc_map.shape + max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2) + acc_map = acc_map * ( (acc_map == max_acc_map).float() ) + flatten_acc_map = acc_map.reshape([-1, ]) + + scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True) + yy = 
torch.div(indices, w, rounding_mode='floor').unsqueeze(-1) + xx = torch.fmod(indices, w).unsqueeze(-1) + yx = torch.cat((yy, xx), dim=-1) + + yx = yx.detach().cpu().numpy() + + topk_values = scores.detach().cpu().numpy() + indices = idx_map[yx[:, 0], yx[:, 1]] + basis = 5 // 2 + + merged_segments = [] + for yx_pt, max_indice, value in zip(yx, indices, topk_values): + y, x = yx_pt + if max_indice == -1 or value == 0: + continue + segment_list = [] + for y_offset in range(-basis, basis + 1): + for x_offset in range(-basis, basis + 1): + indice = idx_map[y + y_offset, x + x_offset] + cnt = int(acc_map_np[y + y_offset, x + x_offset]) + if indice != -1: + segment_list.append(segments[indice]) + if cnt > 1: + check_cnt = 1 + current_hough = hough[indice] + for new_indice, new_hough in enumerate(hough): + if (current_hough == new_hough).all() and indice != new_indice: + segment_list.append(segments[new_indice]) + check_cnt += 1 + if check_cnt == cnt: + break + group_segments = np.array(segment_list).reshape([-1, 2]) + sorted_group_segments = np.sort(group_segments, axis=0) + x_min, y_min = sorted_group_segments[0, :] + x_max, y_max = sorted_group_segments[-1, :] + + deg = theta[max_indice] + if deg >= 90: + merged_segments.append([x_min, y_max, x_max, y_min]) + else: + merged_segments.append([x_min, y_min, x_max, y_max]) + + # 2. 
get intersections + new_segments = np.array(merged_segments) # (x1, y1, x2, y2) + start = new_segments[:, :2] # (x1, y1) + end = new_segments[:, 2:] # (x2, y2) + new_centers = (start + end) / 2.0 + diff = start - end + dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1)) + + # ax + by = c + a = diff[:, 1] + b = -diff[:, 0] + c = a * start[:, 0] + b * start[:, 1] + pre_det = a[:, None] * b[None, :] + det = pre_det - np.transpose(pre_det) + + pre_inter_y = a[:, None] * c[None, :] + inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10) + pre_inter_x = c[:, None] * b[None, :] + inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10) + inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32') + + # 3. get corner information + # 3.1 get distance + ''' + dist_segments: + | dist(0), dist(1), dist(2), ...| + dist_inter_to_segment1: + | dist(inter,0), dist(inter,0), dist(inter,0), ... | + | dist(inter,1), dist(inter,1), dist(inter,1), ... | + ... + dist_inter_to_semgnet2: + | dist(inter,0), dist(inter,1), dist(inter,2), ... | + | dist(inter,0), dist(inter,1), dist(inter,2), ... | + ... 
+ ''' + + dist_inter_to_segment1_start = np.sqrt( + np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] + dist_inter_to_segment1_end = np.sqrt( + np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] + dist_inter_to_segment2_start = np.sqrt( + np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] + dist_inter_to_segment2_end = np.sqrt( + np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] + + # sort ascending + dist_inter_to_segment1 = np.sort( + np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1), + axis=-1) # [n_batch, n_batch, 2] + dist_inter_to_segment2 = np.sort( + np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1), + axis=-1) # [n_batch, n_batch, 2] + + # 3.2 get degree + inter_to_start = new_centers[:, None, :] - inter_pts + deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi + deg_inter_to_start[deg_inter_to_start < 0.0] += 360 + inter_to_end = new_centers[None, :, :] - inter_pts + deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi + deg_inter_to_end[deg_inter_to_end < 0.0] += 360 + + ''' + B -- G + | | + C -- R + B : blue / G: green / C: cyan / R: red + + 0 -- 1 + | | + 3 -- 2 + ''' + # rename variables + deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end + # sort deg ascending + deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1) + + deg_diff_map = np.abs(deg1_map - deg2_map) + # we only consider the smallest degree of intersect + deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180] + + # define available degree range + deg_range = [60, 120] + + corner_dict = {corner_info: [] for corner_info in range(4)} + inter_points = [] + for i in range(inter_pts.shape[0]): + for j in 
range(i + 1, inter_pts.shape[1]): + # i, j > line index, always i < j + x, y = inter_pts[i, j, :] + deg1, deg2 = deg_sort[i, j, :] + deg_diff = deg_diff_map[i, j] + + check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1] + + outside_ratio = params['outside_ratio'] # over ratio >>> drop it! + inside_ratio = params['inside_ratio'] # over ratio >>> drop it! + check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \ + dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \ + (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \ + dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \ + ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \ + dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \ + (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \ + dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio)) + + if check_degree and check_distance: + corner_info = None + + if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \ + (deg2 >= 315 and deg1 >= 45 and deg1 <= 120): + corner_info, color_info = 0, 'blue' + elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225): + corner_info, color_info = 1, 'green' + elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315): + corner_info, color_info = 2, 'black' + elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \ + (deg2 >= 315 and deg1 >= 225 and deg1 <= 315): + corner_info, color_info = 3, 'cyan' + else: + corner_info, color_info = 4, 'red' # we don't use it + continue + + corner_dict[corner_info].append([x, y, i, j]) + inter_points.append([x, y]) + + square_list = [] + connect_list = [] + segments_list = [] + for corner0 in corner_dict[0]: + for corner1 in corner_dict[1]: + connect01 = False + for corner0_line in corner0[2:]: + if corner0_line in corner1[2:]: + connect01 = True + break + if connect01: + for corner2 in corner_dict[2]: + connect12 = False + for 
corner1_line in corner1[2:]: + if corner1_line in corner2[2:]: + connect12 = True + break + if connect12: + for corner3 in corner_dict[3]: + connect23 = False + for corner2_line in corner2[2:]: + if corner2_line in corner3[2:]: + connect23 = True + break + if connect23: + for corner3_line in corner3[2:]: + if corner3_line in corner0[2:]: + # SQUARE!!! + ''' + 0 -- 1 + | | + 3 -- 2 + square_list: + order: 0 > 1 > 2 > 3 + | x0, y0, x1, y1, x2, y2, x3, y3 | + | x0, y0, x1, y1, x2, y2, x3, y3 | + ... + connect_list: + order: 01 > 12 > 23 > 30 + | line_idx01, line_idx12, line_idx23, line_idx30 | + | line_idx01, line_idx12, line_idx23, line_idx30 | + ... + segments_list: + order: 0 > 1 > 2 > 3 + | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j | + | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j | + ... + ''' + square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2]) + connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line]) + segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:]) + + def check_outside_inside(segments_info, connect_idx): + # return 'outside or inside', min distance, cover_param, peri_param + if connect_idx == segments_info[0]: + check_dist_mat = dist_inter_to_segment1 + else: + check_dist_mat = dist_inter_to_segment2 + + i, j = segments_info + min_dist, max_dist = check_dist_mat[i, j, :] + connect_dist = dist_segments[connect_idx] + if max_dist > connect_dist: + return 'outside', min_dist, 0, 1 + else: + return 'inside', min_dist, -1, -1 + + top_square = None + + try: + map_size = input_shape[0] / 2 + squares = np.array(square_list).reshape([-1, 4, 2]) + score_array = [] + connect_array = np.array(connect_list) + segments_array = np.array(segments_list).reshape([-1, 4, 2]) + + # get degree of corners: + squares_rollup = np.roll(squares, 1, axis=1) + squares_rolldown = np.roll(squares, 
-1, axis=1) + vec1 = squares_rollup - squares + normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10) + vec2 = squares_rolldown - squares + normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10) + inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1) # [n_squares, 4] + squares_degree = np.arccos(inner_products) * 180 / np.pi # [n_squares, 4] + + # get square score + overlap_scores = [] + degree_scores = [] + length_scores = [] + + for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree): + ''' + 0 -- 1 + | | + 3 -- 2 + + # segments: [4, 2] + # connects: [4] + ''' + + ###################################### OVERLAP SCORES + cover = 0 + perimeter = 0 + # check 0 > 1 > 2 > 3 + square_length = [] + + for start_idx in range(4): + end_idx = (start_idx + 1) % 4 + + connect_idx = connects[start_idx] # segment idx of segment01 + start_segments = segments[start_idx] + end_segments = segments[end_idx] + + start_point = square[start_idx] + end_point = square[end_idx] + + # check whether outside or inside + start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments, + connect_idx) + end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx) + + cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min + perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min + + square_length.append( + dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min) + + overlap_scores.append(cover / perimeter) + ###################################### + ###################################### DEGREE SCORES + ''' + deg0 vs deg2 + deg1 vs deg3 + ''' + deg0, deg1, deg2, deg3 = degree + deg_ratio1 = deg0 / deg2 + if deg_ratio1 > 1.0: + deg_ratio1 = 1 / deg_ratio1 + deg_ratio2 = deg1 / deg3 + if deg_ratio2 > 
1.0: + deg_ratio2 = 1 / deg_ratio2 + degree_scores.append((deg_ratio1 + deg_ratio2) / 2) + ###################################### + ###################################### LENGTH SCORES + ''' + len0 vs len2 + len1 vs len3 + ''' + len0, len1, len2, len3 = square_length + len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0 + len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1 + length_scores.append((len_ratio1 + len_ratio2) / 2) + + ###################################### + + overlap_scores = np.array(overlap_scores) + overlap_scores /= np.max(overlap_scores) + + degree_scores = np.array(degree_scores) + # degree_scores /= np.max(degree_scores) + + length_scores = np.array(length_scores) + + ###################################### AREA SCORES + area_scores = np.reshape(squares, [-1, 4, 2]) + area_x = area_scores[:, :, 0] + area_y = area_scores[:, :, 1] + correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0] + area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1) + area_scores = 0.5 * np.abs(area_scores + correction) + area_scores /= (map_size * map_size) # np.max(area_scores) + ###################################### + + ###################################### CENTER SCORES + centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2] + # squares: [n, 4, 2] + square_centers = np.mean(squares, axis=1) # [n, 2] + center2center = np.sqrt(np.sum((centers - square_centers) ** 2)) + center_scores = center2center / (map_size / np.sqrt(2.0)) + + ''' + score_w = [overlap, degree, area, center, length] + ''' + score_w = [0.0, 1.0, 10.0, 0.5, 1.0] + score_array = params['w_overlap'] * overlap_scores \ + + params['w_degree'] * degree_scores \ + + params['w_area'] * area_scores \ + - params['w_center'] * center_scores \ + + params['w_length'] * length_scores + + best_square = [] + + sorted_idx = np.argsort(score_array)[::-1] + score_array = score_array[sorted_idx] + squares = 
squares[sorted_idx] + + except Exception as e: + pass + + '''return list + merged_lines, squares, scores + ''' + + try: + new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1] + new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0] + new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1] + new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0] + except: + new_segments = [] + + try: + squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1] + squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0] + except: + squares = [] + score_array = [] + + try: + inter_points = np.array(inter_points) + inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1] + inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0] + except: + inter_points = [] + + return new_segments, squares, score_array, inter_points diff --git a/lavis/common/annotator/openpose/__init__.py b/lavis/common/annotator/openpose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8c26f1b37dae854f51da938da2fa67a8ef48ce5a --- /dev/null +++ b/lavis/common/annotator/openpose/__init__.py @@ -0,0 +1,44 @@ +import os +os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" + +import torch +import numpy as np +from . 
class OpenposeDetector:
    """Body (and optionally hand) pose annotator.

    Wraps the ``Body`` and ``Hand`` estimators and renders their keypoints
    onto a black canvas with the same shape as the input image.
    """

    def __init__(self):
        """Locate (downloading if necessary) both checkpoints and build the estimators."""
        body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth")
        hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth")

        # Check every checkpoint on its own: previously only the hand file was
        # tested, so a missing body checkpoint next to an existing hand
        # checkpoint was never downloaded and Body() failed on load.
        missing_urls = [
            url
            for url, path in ((body_model_path, body_modelpath),
                              (hand_model_path, hand_modelpath))
            if not os.path.exists(path)
        ]
        if missing_urls:
            # Imported lazily so basicsr is only required when a download is needed.
            from basicsr.utils.download_util import load_file_from_url
            for url in missing_urls:
                load_file_from_url(url, model_dir=annotator_ckpts_path)

        self.body_estimation = Body(body_modelpath)
        self.hand_estimation = Hand(hand_modelpath)

    def __call__(self, oriImg, hand=False):
        """Detect poses in *oriImg* and draw them.

        Args:
            oriImg: H x W x 3 image array. Its channel axis is reversed before
                inference (presumably RGB -> BGR; confirm against callers).
            hand: when true, also detect and draw hand keypoints.

        Returns:
            ``(canvas, info)`` where ``canvas`` is the rendered pose image and
            ``info`` is ``dict(candidate=..., subset=...)`` with both arrays
            converted to plain lists.
        """
        oriImg = oriImg[:, :, ::-1].copy()
        with torch.no_grad():
            candidate, subset = self.body_estimation(oriImg)
            canvas = np.zeros_like(oriImg)
            canvas = util.draw_bodypose(canvas, candidate, subset)
            if hand:
                hands_list = util.handDetect(candidate, subset, oriImg)
                all_hand_peaks = []
                for x, y, w, is_left in hands_list:
                    # Run the hand network on the square crop, then shift the
                    # detected peaks back into full-image coordinates. A peak
                    # coordinate of 0 marks "not detected" and is left as is.
                    peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
                    peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
                    peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
                    all_hand_peaks.append(peaks)
                canvas = util.draw_handpose(canvas, all_hand_peaks)
            return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
class Body(object):
    """Body keypoint estimator built on ``bodypose_model``."""

    def __init__(self, model_path):
        """Build the network, load the checkpoint at *model_path* and switch to eval mode."""
        self.model = bodypose_model()
        if torch.cuda.is_available():
            self.model = self.model.cuda()
            print('cuda')
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        """Estimate body keypoints and group them into people.

        Args:
            oriImg: H x W x 3 image array (B, G, R order according to the
                demo code in this file).

        Returns:
            ``(candidate, subset)``:
            candidate -- one row per detected peak: x, y, score, id.
            subset -- n x 20 array; columns 0-17 are indices into
            ``candidate`` (-1 when the part is missing), column 18 is the
            total score and column 19 the number of parts.
        """
        # scale_search = [0.5, 1.0, 1.5, 2.0]
        scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre1 = 0.1
        thre2 = 0.05
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))

        for scale in multiplier:
            imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()
            with torch.no_grad():
                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
            Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
            Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0))  # output 1 is heatmaps
            heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)

            paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0))  # output 0 is PAFs
            paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)

            # Running mean over scales. The accumulator used to be added to
            # itself here, which doubled it on every pass and corrupted the
            # average whenever more than one scale was searched.
            heatmap_avg += heatmap / len(multiplier)
            paf_avg += paf / len(multiplier)

        all_peaks = []
        peak_counter = 0

        for part in range(18):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)

            # Shifted copies of the smoothed map: a pixel is a peak when it is
            # >= its four neighbours and above thre1.
            map_left = np.zeros(one_heatmap.shape)
            map_left[1:, :] = one_heatmap[:-1, :]
            map_right = np.zeros(one_heatmap.shape)
            map_right[:-1, :] = one_heatmap[1:, :]
            map_up = np.zeros(one_heatmap.shape)
            map_up[:, 1:] = one_heatmap[:, :-1]
            map_down = np.zeros(one_heatmap.shape)
            map_down[:, :-1] = one_heatmap[:, 1:]

            peaks_binary = np.logical_and.reduce(
                (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
            peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0]))  # note reverse
            peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
            peak_id = range(peak_counter, peak_counter + len(peaks))
            peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]

            all_peaks.append(peaks_with_score_and_id)
            peak_counter += len(peaks)

        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
                   [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
                   [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
                  [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
                  [55, 56], [37, 38], [45, 46]]

        connection_all = []
        special_k = []
        mid_num = 10

        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            if (nA != 0 and nB != 0):
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)  # avoid dividing by zero for coincident peaks
                        vec = np.divide(vec, norm)

                        # Sample mid_num points along the segment A -> B and
                        # project the PAF at each point onto the segment direction.
                        startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
                                            np.linspace(candA[i][1], candB[j][1], num=mid_num)))

                        vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
                                          for I in range(len(startend))])
                        vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
                                          for I in range(len(startend))])

                        score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                            0.5 * oriImg.shape[0] / norm - 1, 0)
                        criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append(
                                [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])

                # Greedy matching: best-scoring pairs first, each peak used once.
                connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if (i not in connection[:, 3] and j not in connection[:, 4]):
                        connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
                        if (len(connection) >= min(nA, nB)):
                            break

                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])

        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array([item for sublist in all_peaks for item in sublist])

        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1

                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):  # 1:size(subset,1):
                        if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1

                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # as like found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]

                    # if find no partA in the subset, create a new subset
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])
        # delete some rows of subset which has few parts occur
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)

        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        return candidate, subset
class Hand(object):
    """Hand keypoint estimator built on ``handpose_model``."""

    def __init__(self, model_path):
        """Build the network, load the checkpoint at *model_path* and switch to eval mode."""
        net = handpose_model()
        if torch.cuda.is_available():
            net = net.cuda()
            print('cuda')
        self.model = net
        weights = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(weights)
        self.model.eval()

    def __call__(self, oriImg):
        """Return a 21 x 2 array of ``[x, y]`` peaks for the hand crop *oriImg*.

        A part whose smoothed heatmap never exceeds the threshold is reported
        as ``[0, 0]``.
        """
        box_size = 368
        stride = 8
        pad_value = 128
        threshold = 0.05
        img_h, img_w = oriImg.shape[0], oriImg.shape[1]
        # Alternative single-scale setting kept from the original: (0.5,)
        scales = [factor * box_size / img_h for factor in (0.5, 1.0, 1.5, 2.0)]
        n_scales = len(scales)
        heatmap_avg = np.zeros((img_h, img_w, 22))

        for scale in scales:
            resized = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
            padded, pad = util.padRightDownCorner(resized, stride, pad_value)
            batch = np.float32(padded[:, :, :, np.newaxis])
            batch = np.transpose(batch, (3, 2, 0, 1)) / 256 - 0.5
            batch = np.ascontiguousarray(batch)

            data = torch.from_numpy(batch).float()
            if torch.cuda.is_available():
                data = data.cuda()
            with torch.no_grad():
                output = self.model(data).cpu().numpy()

            # Back to image resolution: upsample by the stride, crop away the
            # padding, then resize to the original crop size.
            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))
            heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            valid_h = padded.shape[0] - pad[2]
            valid_w = padded.shape[1] - pad[3]
            heatmap = heatmap[:valid_h, :valid_w, :]
            heatmap = cv2.resize(heatmap, (img_w, img_h), interpolation=cv2.INTER_CUBIC)

            heatmap_avg += heatmap / n_scales

        found_peaks = []
        for part in range(21):
            part_map = heatmap_avg[:, :, part]
            smoothed = gaussian_filter(part_map, sigma=3)
            mask = np.ascontiguousarray(smoothed > threshold, dtype=np.uint8)
            # Everything is below the threshold.
            if np.sum(mask) == 0:
                found_peaks.append([0, 0])
                continue
            regions, n_regions = label(mask, return_num=True, connectivity=mask.ndim)
            # Keep only the connected region carrying the most heatmap mass.
            region_mass = [np.sum(part_map[regions == r]) for r in range(1, n_regions + 1)]
            strongest = np.argmax(region_mass) + 1
            regions[regions != strongest] = 0
            part_map[regions == 0] = 0

            y, x = util.npmax(part_map)
            found_peaks.append([x, y])
        return np.array(found_peaks)
'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1'] + blocks = {} + block0 = OrderedDict([ + ('conv1_1', [3, 64, 3, 1, 1]), + ('conv1_2', [64, 64, 3, 1, 1]), + ('pool1_stage1', [2, 2, 0]), + ('conv2_1', [64, 128, 3, 1, 1]), + ('conv2_2', [128, 128, 3, 1, 1]), + ('pool2_stage1', [2, 2, 0]), + ('conv3_1', [128, 256, 3, 1, 1]), + ('conv3_2', [256, 256, 3, 1, 1]), + ('conv3_3', [256, 256, 3, 1, 1]), + ('conv3_4', [256, 256, 3, 1, 1]), + ('pool3_stage1', [2, 2, 0]), + ('conv4_1', [256, 512, 3, 1, 1]), + ('conv4_2', [512, 512, 3, 1, 1]), + ('conv4_3_CPM', [512, 256, 3, 1, 1]), + ('conv4_4_CPM', [256, 128, 3, 1, 1]) + ]) + + + # Stage 1 + block1_1 = OrderedDict([ + ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L1', [512, 38, 1, 1, 0]) + ]) + + block1_2 = OrderedDict([ + ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L2', [512, 19, 1, 1, 0]) + ]) + blocks['block1_1'] = block1_1 + blocks['block1_2'] = block1_2 + + self.model0 = make_layers(block0, no_relu_layers) + + # Stages 2 - 6 + for i in range(2, 7): + blocks['block%d_1' % i] = OrderedDict([ + ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) + ]) + + blocks['block%d_2' % i] = OrderedDict([ + ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), + 
class handpose_model(nn.Module):
    """Six-stage hand pose network: a shared feature extractor plus refinement stages.

    Every stage after the first receives the previous stage's 22-channel
    output concatenated with the 128-channel shared features (150 channels).
    """

    def __init__(self):
        super().__init__()

        # Final layer of each stage: no ReLU is appended after these.
        linear_outputs = ['conv6_2_CPM'] + ['Mconv7_stage%d' % s for s in range(2, 7)]

        # Stage 1, shared feature extractor.
        feature_spec = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3', [512, 512, 3, 1, 1]),
            ('conv4_4', [512, 512, 3, 1, 1]),
            ('conv5_1', [512, 512, 3, 1, 1]),
            ('conv5_2', [512, 512, 3, 1, 1]),
            ('conv5_3_CPM', [512, 128, 3, 1, 1]),
        ])

        # Stage 1, prediction head.
        head_spec = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0]),
        ])

        specs = {'block1_0': feature_spec, 'block1_1': head_spec}

        # Stages 2-6 share one layout and differ only in the layer names.
        for stage in range(2, 7):
            layout = [[150, 128, 7, 1, 3]]
            layout += [[128, 128, 7, 1, 3] for _ in range(4)]
            layout += [[128, 128, 1, 1, 0], [128, 22, 1, 1, 0]]
            specs['block%d' % stage] = OrderedDict(
                ('Mconv%d_stage%d' % (idx, stage), cfg)
                for idx, cfg in enumerate(layout, start=1)
            )

        # Built in insertion order, matching the original construction order.
        built = {key: make_layers(spec, linear_outputs) for key, spec in specs.items()}

        self.model1_0 = built['block1_0']
        self.model1_1 = built['block1_1']
        self.model2 = built['block2']
        self.model3 = built['block3']
        self.model4 = built['block4']
        self.model5 = built['block5']
        self.model6 = built['block6']

    def forward(self, x):
        """Run all six stages and return the last stage's output."""
        features = self.model1_0(x)
        belief = self.model1_1(features)
        for refine in (self.model2, self.model3, self.model4, self.model5, self.model6):
            belief = refine(torch.cat([belief, features], 1))
        return belief
def padRightDownCorner(img, stride, padValue):
    """Pad *img* on the bottom and right so both sides are multiples of *stride*.

    Args:
        img: H x W x C image array.
        stride: positive integer the padded height and width must be
            divisible by.
        padValue: constant written into the padded area.

    Returns:
        ``(img_padded, pad)`` where ``pad`` is ``[up, left, down, right]``;
        ``up`` and ``left`` are always 0. ``img_padded`` is a new array.
    """
    h = img.shape[0]
    w = img.shape[1]

    # -n % stride is the distance from n up to the next multiple of stride
    # (0 when n is already aligned).
    pad_down = -h % stride
    pad_right = -w % stride
    pad = [0, 0, pad_down, pad_right]  # up, left, down, right

    # np.pad replaces the tile-and-concatenate approach, whose [-2:-1]
    # template slice was empty for 1-pixel-tall or 1-pixel-wide inputs and
    # therefore silently added no padding while still reporting it in `pad`.
    img_padded = np.pad(
        img,
        ((0, pad_down), (0, pad_right), (0, 0)),
        mode='constant',
        constant_values=padValue,
    )

    return img_padded, pad
# image drawn by opencv is not good.
def draw_handpose(canvas, all_hand_peaks, show_number=False):
    """Draw hand skeletons onto *canvas* and return it.

    Args:
        canvas: image array drawn on in place by the cv2 calls.
        all_hand_peaks: iterable of per-hand keypoint arrays, indexed by
            keypoint id, each row being ``(x, y)``.
        show_number: when true, label every keypoint with its index.
    """
    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
    n_edges = float(len(edges))

    for peaks in all_hand_peaks:
        for edge_no, edge in enumerate(edges):
            # Skip the edge unless both endpoints have non-zero x and y.
            if not np.all(peaks[edge]):
                continue
            x1, y1 = peaks[edge[0]]
            x2, y2 = peaks[edge[1]]
            colour = matplotlib.colors.hsv_to_rgb([edge_no / n_edges, 1.0, 1.0]) * 255
            cv2.line(canvas, (x1, y1), (x2, y2), colour, thickness=2)

        for number, keypoint in enumerate(peaks):
            x, y = keypoint
            cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
            if show_number:
                cv2.putText(canvas, str(number), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA)
    return canvas
image_height, image_width = oriImg.shape[0:2] + for person in subset.astype(int): + # if any of three not detected + has_left = np.sum(person[[5, 6, 7]] == -1) == 0 + has_right = np.sum(person[[2, 3, 4]] == -1) == 0 + if not (has_left or has_right): + continue + hands = [] + #left hand + if has_left: + left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] + x1, y1 = candidate[left_shoulder_index][:2] + x2, y2 = candidate[left_elbow_index][:2] + x3, y3 = candidate[left_wrist_index][:2] + hands.append([x1, y1, x2, y2, x3, y3, True]) + # right hand + if has_right: + right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] + x1, y1 = candidate[right_shoulder_index][:2] + x2, y2 = candidate[right_elbow_index][:2] + x3, y3 = candidate[right_wrist_index][:2] + hands.append([x1, y1, x2, y2, x3, y3, False]) + + for x1, y1, x2, y2, x3, y3, is_left in hands: + # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox + # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]); + # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]); + # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow); + # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder); + # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); + x = x3 + ratioWristElbow * (x3 - x2) + y = y3 + ratioWristElbow * (y3 - y2) + distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) + distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) + width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) + # x-y refers to the center --> offset to topLeft point + # handRectangle.x -= handRectangle.width / 2.f; + # handRectangle.y -= handRectangle.height / 2.f; + x -= width / 2 + y -= width / 2 # width = height + # overflow the image + if 
# get max index of 2d array
def npmax(array):
    """Return ``(row, col)`` of the largest entry of a 2-D array.

    Ties resolve to the first row holding the maximum, then to the first
    column within that row.
    """
    best_col_per_row = array.argmax(axis=1)
    row = array.max(axis=1).argmax()
    return row, best_col_per_row[row]
# dataset settings
dataset_type = 'ADE20KDataset'
data_root = 'data/ade/ADEChallengeData2016'
# Normalization settings expanded into both Normalize steps below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
crop_size = (512, 512)

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'reduce_zero_label': True},
    {'type': 'Resize', 'img_scale': (2048, 512), 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2048, 512),
        # Multi-scale ratios are left disabled:
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/training',
        'ann_dir': 'annotations/training',
        'pipeline': train_pipeline,
    },
    # val and test both read the validation folders with the test pipeline.
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
}
img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline)) diff --git a/lavis/common/annotator/uniformer/configs/_base_/datasets/cityscapes.py b/lavis/common/annotator/uniformer/configs/_base_/datasets/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..f21867c63e1835f6fceb61f066e802fd8fd2a735 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/datasets/cityscapes.py @@ -0,0 +1,54 @@ +# dataset settings +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + img_dir='leftImg8bit/train', + ann_dir='gtFine/train', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + img_dir='leftImg8bit/val', + ann_dir='gtFine/val', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + img_dir='leftImg8bit/val', + 
# Pipeline overrides on top of the config named by _base_, using 769x769 crops.
_base_ = './cityscapes.py'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
crop_size = (769, 769)

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations'},
    {'type': 'Resize', 'img_scale': (2049, 1025), 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2049, 1025),
        # Multi-scale ratios are left disabled:
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# Only the pipelines are set here; no other dataset keys are given.
data = {
    'train': {'pipeline': train_pipeline},
    'val': {'pipeline': test_pipeline},
    'test': {'pipeline': test_pipeline},
}
# dataset settings
dataset_type = 'DRIVEDataset'
data_root = 'data/DRIVE'
# Normalization settings expanded into both Normalize steps below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
img_scale = (584, 565)
crop_size = (64, 64)

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations'},
    {'type': 'Resize', 'img_scale': img_scale, 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': img_scale,
        # Multi-scale ratios are left disabled:
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    # The training split is wrapped in a RepeatDataset with times=40000.
    'train': {
        'type': 'RepeatDataset',
        'times': 40000,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'img_dir': 'images/training',
            'ann_dir': 'annotations/training',
            'pipeline': train_pipeline,
        },
    },
    # val and test both read the validation folders with the test pipeline.
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
}
# dataset settings
dataset_type = 'HRFDataset'
data_root = 'data/HRF'
# Normalization settings expanded into both Normalize steps below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
img_scale = (2336, 3504)
crop_size = (256, 256)

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations'},
    {'type': 'Resize', 'img_scale': img_scale, 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': img_scale,
        # Multi-scale ratios are left disabled:
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    # The training split is wrapped in a RepeatDataset with times=40000.
    'train': {
        'type': 'RepeatDataset',
        'times': 40000,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'img_dir': 'images/training',
            'ann_dir': 'annotations/training',
            'pipeline': train_pipeline,
        },
    },
    # val and test both read the validation folders with the test pipeline.
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
}
# dataset settings
dataset_type = 'PascalContextDataset'
data_root = 'data/VOCdevkit/VOC2010/'
# Normalization settings expanded into both Normalize steps below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

img_scale = (520, 520)
crop_size = (480, 480)

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations'},
    {'type': 'Resize', 'img_scale': img_scale, 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': img_scale,
        # Multi-scale ratios are left disabled:
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'JPEGImages',
        'ann_dir': 'SegmentationClassContext',
        'split': 'ImageSets/SegmentationContext/train.txt',
        'pipeline': train_pipeline,
    },
    # val and test both use the val.txt split with the test pipeline.
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'JPEGImages',
        'ann_dir': 'SegmentationClassContext',
        'split': 'ImageSets/SegmentationContext/val.txt',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'JPEGImages',
        'ann_dir': 'SegmentationClassContext',
        'split': 'ImageSets/SegmentationContext/val.txt',
        'pipeline': test_pipeline,
    },
}
# dataset settings
dataset_type = 'PascalContextDataset59'
data_root = 'data/VOCdevkit/VOC2010/'
# Normalization settings expanded into both Normalize steps below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

img_scale = (520, 520)
crop_size = (480, 480)

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'reduce_zero_label': True},
    {'type': 'Resize', 'img_scale': img_scale, 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': img_scale,
        # Multi-scale ratios are left disabled:
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'JPEGImages',
        'ann_dir': 'SegmentationClassContext',
        'split': 'ImageSets/SegmentationContext/train.txt',
        'pipeline': train_pipeline,
    },
    # val and test both use the val.txt split with the test pipeline.
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'JPEGImages',
        'ann_dir': 'SegmentationClassContext',
        'split': 'ImageSets/SegmentationContext/val.txt',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'JPEGImages',
        'ann_dir': 'SegmentationClassContext',
        'split': 'ImageSets/SegmentationContext/val.txt',
        'pipeline': test_pipeline,
    },
}
_base_ = './pascal_voc12.py'
# dataset settings
# Only the training entry is given here: two annotation directories and two
# split files, listed in the same order.
data = {
    'train': {
        'ann_dir': ['SegmentationClass', 'SegmentationClassAug'],
        'split': [
            'ImageSets/Segmentation/train.txt',
            'ImageSets/Segmentation/aug.txt',
        ],
    },
}
# yapf:disable
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook', 'by_epoch': False},
        # A TensorboardLoggerHook entry is left disabled:
        # dict(type='TensorboardLoggerHook')
    ],
}
# yapf:enable
dist_params = {'backend': 'nccl'}
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='ANNHead', + in_channels=[1024, 2048], + in_index=[2, 3], + channels=512, + project_channels=256, + query_scales=(1, ), + key_pool_scales=(1, 3, 6, 8), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f5316cbcf3896ba9de7ca2c801eba512f01d5e --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py @@ -0,0 +1,44 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='APCHead', + in_channels=2048, + in_index=3, + channels=512, + pool_scales=(1, 2, 3, 6), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=dict(type='SyncBN', requires_grad=True), + 
# model settings
# One normalization config, reused by the backbone and both heads.
norm_cfg = {'type': 'SyncBN', 'requires_grad': True}
model = {
    'type': 'EncoderDecoder',
    'pretrained': 'open-mmlab://resnet50_v1c',
    'backbone': {
        'type': 'ResNetV1c',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'dilations': (1, 1, 2, 4),
        'strides': (1, 2, 1, 1),
        'norm_cfg': norm_cfg,
        'norm_eval': False,
        'style': 'pytorch',
        'contract_dilation': True,
    },
    'decode_head': {
        'type': 'CCHead',
        'in_channels': 2048,
        'in_index': 3,
        'channels': 512,
        'recurrence': 2,
        'dropout_ratio': 0.1,
        'num_classes': 19,
        'norm_cfg': norm_cfg,
        'align_corners': False,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
    },
    'auxiliary_head': {
        'type': 'FCNHead',
        'in_channels': 1024,
        'in_index': 2,
        'channels': 256,
        'num_convs': 1,
        'concat_input': False,
        'dropout_ratio': 0.1,
        'num_classes': 19,
        'norm_cfg': norm_cfg,
        'align_corners': False,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.4,
        },
    },
    # model training and testing settings
    'train_cfg': {},
    'test_cfg': {'mode': 'whole'},
}
# model settings
# One normalization config, reused by the backbone and the decode head.
norm_cfg = {'type': 'SyncBN', 'eps': 1e-03, 'requires_grad': True}
model = {
    'type': 'EncoderDecoder',
    'backbone': {
        'type': 'CGNet',
        'norm_cfg': norm_cfg,
        'in_channels': 3,
        'num_channels': (32, 64, 128),
        'num_blocks': (3, 21),
        'dilations': (2, 4),
        'reductions': (8, 16),
    },
    'decode_head': {
        'type': 'FCNHead',
        'in_channels': 256,
        'in_index': 2,
        'channels': 256,
        'num_convs': 0,
        'concat_input': False,
        'dropout_ratio': 0,
        'num_classes': 19,
        'norm_cfg': norm_cfg,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
            # 19 weights, matching num_classes above.
            'class_weight': [
                2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899,
                9.369352, 10.289121, 9.953208, 4.3097677, 9.490387,
                7.674431, 9.396905, 10.347791, 6.3927646, 10.226669,
                10.241062, 10.280587, 10.396974, 10.055647,
            ],
        },
    },
    # model training and testing settings
    'train_cfg': {'sampler': None},
    'test_cfg': {'mode': 'whole'},
}
channels=512, + pam_channels=64, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a43bee01422ad4795dd27874e0cd4bb6cbfecf --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py @@ -0,0 +1,44 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='ASPPHead', + in_channels=2048, + in_index=3, + channels=512, + dilations=(1, 12, 24, 36), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + 
# model settings
# One normalization config, reused by the backbone and both heads.
norm_cfg = {'type': 'SyncBN', 'requires_grad': True}
model = {
    'type': 'EncoderDecoder',
    'pretrained': None,
    'backbone': {
        'type': 'UNet',
        'in_channels': 3,
        'base_channels': 64,
        'num_stages': 5,
        'strides': (1, 1, 1, 1, 1),
        'enc_num_convs': (2, 2, 2, 2, 2),
        'dec_num_convs': (2, 2, 2, 2),
        'downsamples': (True, True, True, True),
        'enc_dilations': (1, 1, 1, 1, 1),
        'dec_dilations': (1, 1, 1, 1),
        'with_cp': False,
        'conv_cfg': None,
        'norm_cfg': norm_cfg,
        'act_cfg': {'type': 'ReLU'},
        'upsample_cfg': {'type': 'InterpConv'},
        'norm_eval': False,
    },
    'decode_head': {
        'type': 'ASPPHead',
        'in_channels': 64,
        'in_index': 4,
        'channels': 16,
        'dilations': (1, 12, 24, 36),
        'dropout_ratio': 0.1,
        'num_classes': 2,
        'norm_cfg': norm_cfg,
        'align_corners': False,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
    },
    'auxiliary_head': {
        'type': 'FCNHead',
        'in_channels': 128,
        'in_index': 3,
        'channels': 64,
        'num_convs': 1,
        'concat_input': False,
        'dropout_ratio': 0.1,
        'num_classes': 2,
        'norm_cfg': norm_cfg,
        'align_corners': False,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.4,
        },
    },
    # model training and testing settings
    'train_cfg': {},
    'test_cfg': {'mode': 'slide', 'crop_size': 256, 'stride': 170},
}
# model settings
# One normalization config, reused by the backbone and both heads.
norm_cfg = {'type': 'SyncBN', 'requires_grad': True}
model = {
    'type': 'EncoderDecoder',
    'pretrained': 'open-mmlab://resnet50_v1c',
    'backbone': {
        'type': 'ResNetV1c',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'dilations': (1, 1, 2, 4),
        'strides': (1, 2, 1, 1),
        'norm_cfg': norm_cfg,
        'norm_eval': False,
        'style': 'pytorch',
        'contract_dilation': True,
    },
    'decode_head': {
        'type': 'DepthwiseSeparableASPPHead',
        'in_channels': 2048,
        'in_index': 3,
        'channels': 512,
        'dilations': (1, 12, 24, 36),
        'c1_in_channels': 256,
        'c1_channels': 48,
        'dropout_ratio': 0.1,
        'num_classes': 19,
        'norm_cfg': norm_cfg,
        'align_corners': False,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
    },
    'auxiliary_head': {
        'type': 'FCNHead',
        'in_channels': 1024,
        'in_index': 2,
        'channels': 256,
        'num_convs': 1,
        'concat_input': False,
        'dropout_ratio': 0.1,
        'num_classes': 19,
        'norm_cfg': norm_cfg,
        'align_corners': False,
        'loss_decode': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.4,
        },
    },
    # model training and testing settings
    'train_cfg': {},
    'test_cfg': {'mode': 'whole'},
}
type='DMHead', + in_channels=2048, + in_index=3, + channels=512, + filter_sizes=(1, 3, 5, 7), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=dict(type='SyncBN', requires_grad=True), + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..edb4c174c51e34c103737ba39bfc48bf831e561d --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py @@ -0,0 +1,46 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='DNLHead', + in_channels=2048, + in_index=3, + channels=512, + dropout_ratio=0.1, + reduction=2, + use_scale=True, + mode='embedded_gaussian', + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + 
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..26adcd430926de0862204a71d345f2543167f27b --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py @@ -0,0 +1,47 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='EMAHead', + in_channels=2048, + in_index=3, + channels=256, + ema_channels=512, + num_bases=64, + num_stages=3, + momentum=0.1, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..be777123a886503172a95fe0719e956a147bbd68 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py @@ -0,0 +1,48 
@@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='EncHead', + in_channels=[512, 1024, 2048], + in_index=(1, 2, 3), + channels=512, + num_codes=32, + use_se_loss=True, + add_lateral=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_se_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/fast_scnn.py b/lavis/common/annotator/uniformer/configs/_base_/models/fast_scnn.py new file mode 100644 index 0000000000000000000000000000000000000000..32fdeb659355a5ce5ef2cc7c2f30742703811cdf --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/fast_scnn.py @@ -0,0 +1,57 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01) +model = dict( + type='EncoderDecoder', + backbone=dict( + type='FastSCNN', + downsample_dw_channels=(32, 48), + global_in_channels=64, + global_block_channels=(64, 96, 128), + global_block_strides=(2, 2, 1), + global_out_channels=128, + higher_in_channels=64, + lower_in_channels=128, + fusion_out_channels=128, + out_indices=(0, 1, 2), + 
norm_cfg=norm_cfg, + align_corners=False), + decode_head=dict( + type='DepthwiseSeparableFCNHead', + in_channels=128, + channels=128, + concat_input=False, + num_classes=19, + in_index=-1, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=128, + channels=32, + num_convs=1, + num_classes=19, + in_index=-2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), + dict( + type='FCNHead', + in_channels=64, + channels=32, + num_convs=1, + num_classes=19, + in_index=-3, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), + ], + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/fcn_hr18.py b/lavis/common/annotator/uniformer/configs/_base_/models/fcn_hr18.py new file mode 100644 index 0000000000000000000000000000000000000000..c3e299bc89ada56ca14bbffcbdb08a586b8ed9e9 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/fcn_hr18.py @@ -0,0 +1,52 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://msra/hrnetv2_w18', + backbone=dict( + type='HRNet', + norm_cfg=norm_cfg, + norm_eval=False, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + 
num_channels=(18, 36, 72, 144)))), + decode_head=dict( + type='FCNHead', + in_channels=[18, 36, 72, 144], + in_index=(0, 1, 2, 3), + channels=sum([18, 36, 72, 144]), + input_transform='resize_concat', + kernel_size=1, + num_convs=1, + concat_input=False, + dropout_ratio=-1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..5e98f6cc918b6146fc6d613c6918e825ef1355c3 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py @@ -0,0 +1,45 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='FCNHead', + in_channels=2048, + in_index=3, + channels=512, + num_convs=2, + concat_input=True, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git 
a/lavis/common/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py b/lavis/common/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py new file mode 100644 index 0000000000000000000000000000000000000000..a33e7972877f902d0e7d18401ca675e3e4e60a18 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py @@ -0,0 +1,51 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained=None, + backbone=dict( + type='UNet', + in_channels=3, + base_channels=64, + num_stages=5, + strides=(1, 1, 1, 1, 1), + enc_num_convs=(2, 2, 2, 2, 2), + dec_num_convs=(2, 2, 2, 2), + downsamples=(True, True, True, True), + enc_dilations=(1, 1, 1, 1, 1), + dec_dilations=(1, 1, 1, 1), + with_cp=False, + conv_cfg=None, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + upsample_cfg=dict(type='InterpConv'), + norm_eval=False), + decode_head=dict( + type='FCNHead', + in_channels=64, + in_index=4, + channels=64, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=2, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=128, + in_index=3, + channels=64, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=2, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=256, stride=170)) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/fpn_r50.py b/lavis/common/annotator/uniformer/configs/_base_/models/fpn_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..86ab327db92e44c14822d65f1c9277cb007f17c1 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/fpn_r50.py @@ -0,0 +1,36 @@ +# model 
settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + decode_head=dict( + type='FPNHead', + in_channels=[256, 256, 256, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py b/lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..8aae98c5991055bfcc08e82ccdc09f8b1d9f8a8d --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py @@ -0,0 +1,35 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + type='UniFormer', + embed_dim=[64, 128, 320, 512], + layers=[3, 4, 8, 3], + head_dim=64, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1), + neck=dict( + type='FPN', + in_channels=[64, 128, 320, 512], + out_channels=256, + num_outs=4), + decode_head=dict( + type='FPNHead', + in_channels=[256, 256, 256, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model 
training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..3d2ad69f5c22adfe79d5fdabf920217628987166 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py @@ -0,0 +1,46 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='GCHead', + in_channels=2048, + in_index=3, + channels=512, + ratio=1 / 4., + pooling_type='att', + fusion_types=('channel_add', ), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..93258242a90695cc94a7c6bd41562d6a75988771 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py @@ -0,0 +1,25 @@ +# model settings +norm_cfg = dict(type='SyncBN', eps=0.001, 
requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + type='MobileNetV3', + arch='large', + out_indices=(1, 3, 16), + norm_cfg=norm_cfg), + decode_head=dict( + type='LRASPPHead', + in_channels=(16, 24, 960), + in_index=(0, 1, 2), + channels=128, + input_transform='multiple_select', + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..5674a39854cafd1f2e363bac99c58ccae62f24da --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py @@ -0,0 +1,46 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='NLHead', + in_channels=2048, + in_index=3, + channels=512, + dropout_ratio=0.1, + reduction=2, + use_scale=True, + mode='embedded_gaussian', + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and 
testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py b/lavis/common/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py new file mode 100644 index 0000000000000000000000000000000000000000..c60f62a7cdf3f5c5096a7a7e725e8268fddcb057 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py @@ -0,0 +1,68 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='CascadeEncoderDecoder', + num_stages=2, + pretrained='open-mmlab://msra/hrnetv2_w18', + backbone=dict( + type='HRNet', + norm_cfg=norm_cfg, + norm_eval=False, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[18, 36, 72, 144], + channels=sum([18, 36, 72, 144]), + in_index=(0, 1, 2, 3), + input_transform='resize_concat', + kernel_size=1, + num_convs=1, + concat_input=False, + dropout_ratio=-1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[18, 36, 72, 144], + in_index=(0, 1, 2, 3), + input_transform='resize_concat', + channels=512, + ocr_channels=256, + dropout_ratio=-1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ], + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) 
diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..615aa3ff703942b6c22b2d6e9642504dd3e41ebd --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py @@ -0,0 +1,47 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='CascadeEncoderDecoder', + num_stages=2, + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=[ + dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=2048, + in_index=3, + channels=512, + ocr_channels=256, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ], + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/pointrend_r50.py b/lavis/common/annotator/uniformer/configs/_base_/models/pointrend_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..9d323dbf9466d41e0800aa57ef84045f3d874bdf --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/pointrend_r50.py @@ -0,0 +1,56 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='CascadeEncoderDecoder', + num_stages=2, + pretrained='open-mmlab://resnet50_v1c', + 
backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + decode_head=[ + dict( + type='FPNHead', + in_channels=[256, 256, 256, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=-1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='PointHead', + in_channels=[256], + in_index=[0], + channels=256, + num_fcs=3, + coarse_pred_each_layer=True, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ], + # model training and testing settings + train_cfg=dict( + num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75), + test_cfg=dict( + mode='whole', + subdivision_steps=2, + subdivision_num_points=8196, + scale_factor=2)) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..689513fa9d2a40f14bf0ae4ae61f38f0dcc1b3da --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py @@ -0,0 +1,49 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='PSAHead', + in_channels=2048, + in_index=3, + 
channels=512, + mask_size=(97, 97), + psa_type='bi-direction', + compact=False, + shrink_factor=2, + normalization_factor=1.0, + psa_softmax=True, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py b/lavis/common/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py new file mode 100644 index 0000000000000000000000000000000000000000..f451e08ad2eb0732dcb806b1851eb978d4acf136 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py @@ -0,0 +1,44 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='PSPHead', + in_channels=2048, + in_index=3, + channels=512, + pool_scales=(1, 2, 3, 6), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + 
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py b/lavis/common/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py new file mode 100644 index 0000000000000000000000000000000000000000..fcff9ec4f41fad158344ecd77313dc14564f3682 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py @@ -0,0 +1,50 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained=None, + backbone=dict( + type='UNet', + in_channels=3, + base_channels=64, + num_stages=5, + strides=(1, 1, 1, 1, 1), + enc_num_convs=(2, 2, 2, 2, 2), + dec_num_convs=(2, 2, 2, 2), + downsamples=(True, True, True, True), + enc_dilations=(1, 1, 1, 1, 1), + dec_dilations=(1, 1, 1, 1), + with_cp=False, + conv_cfg=None, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + upsample_cfg=dict(type='InterpConv'), + norm_eval=False), + decode_head=dict( + type='PSPHead', + in_channels=64, + in_index=4, + channels=16, + pool_scales=(1, 2, 3, 6), + dropout_ratio=0.1, + num_classes=2, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=128, + in_index=3, + channels=64, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=2, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=256, stride=170)) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/upernet_r50.py b/lavis/common/annotator/uniformer/configs/_base_/models/upernet_r50.py new file mode 100644 index 
0000000000000000000000000000000000000000..10974962fdd7136031fd06de1700f497d355ceaa --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/upernet_r50.py @@ -0,0 +1,44 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='UPerHead', + in_channels=[256, 512, 1024, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/lavis/common/annotator/uniformer/configs/_base_/models/upernet_uniformer.py b/lavis/common/annotator/uniformer/configs/_base_/models/upernet_uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..41aa4db809dc6e2c508e98051f61807d07477903 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/models/upernet_uniformer.py @@ -0,0 +1,43 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained=None, + backbone=dict( + type='UniFormer', + embed_dim=[64, 128, 320, 512], + layers=[3, 4, 8, 3], + head_dim=64, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1), + 
decode_head=dict( + type='UPerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=320, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) \ No newline at end of file diff --git a/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py new file mode 100644 index 0000000000000000000000000000000000000000..52603890b10f25faf8eec9f9e5a4468fae09b811 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py @@ -0,0 +1,9 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=16000) +evaluation = dict(interval=16000, metric='mIoU') diff --git a/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py new file mode 100644 index 0000000000000000000000000000000000000000..bf780a1b6f6521833c6a5859675147824efa599d --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py @@ -0,0 +1,9 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict() +# 
learning policy +lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=20000) +checkpoint_config = dict(by_epoch=False, interval=2000) +evaluation = dict(interval=2000, metric='mIoU') diff --git a/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py new file mode 100644 index 0000000000000000000000000000000000000000..cdbf841abcb26eed87bf76ab816aff4bae0630ee --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py @@ -0,0 +1,9 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=40000) +checkpoint_config = dict(by_epoch=False, interval=4000) +evaluation = dict(interval=4000, metric='mIoU') diff --git a/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py new file mode 100644 index 0000000000000000000000000000000000000000..c190cee6bdc7922b688ea75dc8f152fa15c24617 --- /dev/null +++ b/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py @@ -0,0 +1,9 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=80000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') diff --git a/lavis/common/annotator/uniformer/exp/upernet_global_small/config.py b/lavis/common/annotator/uniformer/exp/upernet_global_small/config.py new file mode 100644 index 
# Training config for the "upernet_global_small" experiment: UniFormer
# backbone with UPerNet-style decode/auxiliary heads, 150 classes, on top of
# the shared base configs listed below.
_base_ = [
    '../../configs/_base_/models/upernet_uniformer.py',
    '../../configs/_base_/datasets/ade20k.py',
    '../../configs/_base_/default_runtime.py',
    '../../configs/_base_/schedules/schedule_160k.py'
]
# Only the keys given here are overridden; everything else comes from the
# base model file.
model = dict(
    backbone=dict(
        type='UniFormer',
        # One entry per backbone stage; the decode head's in_channels below
        # repeats the same four widths.
        embed_dim=[64, 128, 320, 512],
        layers=[3, 4, 8, 3],
        head_dim=64,
        drop_path_rate=0.25,
        # NOTE(review): windows/hybrid presumably select the backbone's
        # attention variant (both off here); confirm in the UniFormer
        # backbone implementation.
        windows=False,
        hybrid=False
    ),
    decode_head=dict(
        in_channels=[64, 128, 320, 512],
        num_classes=150
    ),
    # The auxiliary head uses 320 channels, the third entry of embed_dim.
    auxiliary_head=dict(
        in_channels=320,
        num_classes=150
    ))

# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
# NOTE(review): `_delete_=True` presumably makes the config loader discard the
# inherited optimizer dict instead of merging into it; confirm in the mmcv
# Config documentation.
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
                 paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
                                                 'relative_position_bias_table': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))

# Replaces the base LR schedule: linear warmup over the first 1500 iterations,
# then 'poly' with power 1.0 down to a minimum LR of 0.
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# Two samples per GPU.
data=dict(samples_per_gpu=2)
#!/usr/bin/env bash
# Evaluate the upernet_global_small checkpoint (mIoU) with 8 processes via
# torch.distributed.launch, appending combined stdout/stderr to log.txt in
# this script's directory.
#
# `tools/test.py` is resolved relative to the current working directory, so
# run this from the directory that contains `tools/`.

# Directory of this script. All expansions are quoted so that a checkout path
# containing spaces or glob characters is not split or expanded by the shell.
work_path="$(dirname "$0")"

PYTHONPATH="${work_path}/../../:${PYTHONPATH}" \
python -m torch.distributed.launch --nproc_per_node=8 \
    tools/test.py "${work_path}/test_config_h32.py" \
    "${work_path}/ckpt/latest.pth" \
    --launcher pytorch \
    --eval mIoU \
    2>&1 | tee -a "${work_path}/log.txt"
# Evaluation config for the "upernet_global_small" experiment with
# hybrid=True and window_size=32 on the backbone. test.sh in this directory
# passes this file to tools/test.py.
_base_ = [
    '../../configs/_base_/models/upernet_uniformer.py',
    '../../configs/_base_/datasets/ade20k.py',
    '../../configs/_base_/default_runtime.py',
    '../../configs/_base_/schedules/schedule_160k.py'
]
# Only the keys given here are overridden; everything else comes from the
# base model file.
model = dict(
    backbone=dict(
        type='UniFormer',
        # One entry per backbone stage; the decode head's in_channels below
        # repeats the same four widths.
        embed_dim=[64, 128, 320, 512],
        layers=[3, 4, 8, 3],
        head_dim=64,
        drop_path_rate=0.25,
        # NOTE(review): presumably enables the backbone's hybrid attention
        # mode with a 32-wide window; confirm in the UniFormer backbone
        # implementation.
        windows=False,
        hybrid=True,
        window_size=32
    ),
    decode_head=dict(
        in_channels=[64, 128, 320, 512],
        num_classes=150
    ),
    # The auxiliary head uses 320 channels, the third entry of embed_dim.
    auxiliary_head=dict(
        in_channels=320,
        num_classes=150
    ))

# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
# NOTE(review): `_delete_=True` presumably makes the config loader discard the
# inherited optimizer dict instead of merging into it; confirm in the mmcv
# Config documentation.
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
                 paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
                                                 'relative_position_bias_table': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))

# Replaces the base LR schedule: linear warmup over the first 1500 iterations,
# then 'poly' with power 1.0 down to a minimum LR of 0.
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# Two samples per GPU.
data=dict(samples_per_gpu=2)
# Evaluation config for the "upernet_global_small" experiment with
# windows=True and window_size=32 on the backbone.
_base_ = [
    '../../configs/_base_/models/upernet_uniformer.py',
    '../../configs/_base_/datasets/ade20k.py',
    '../../configs/_base_/default_runtime.py',
    '../../configs/_base_/schedules/schedule_160k.py'
]
# Only the keys given here are overridden; everything else comes from the
# base model file.
model = dict(
    backbone=dict(
        type='UniFormer',
        # One entry per backbone stage; the decode head's in_channels below
        # repeats the same four widths.
        embed_dim=[64, 128, 320, 512],
        layers=[3, 4, 8, 3],
        head_dim=64,
        drop_path_rate=0.25,
        # NOTE(review): presumably enables the backbone's windowed attention
        # mode with a 32-wide window; confirm in the UniFormer backbone
        # implementation.
        windows=True,
        hybrid=False,
        window_size=32
    ),
    decode_head=dict(
        in_channels=[64, 128, 320, 512],
        num_classes=150
    ),
    # The auxiliary head uses 320 channels, the third entry of embed_dim.
    auxiliary_head=dict(
        in_channels=320,
        num_classes=150
    ))

# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
# NOTE(review): `_delete_=True` presumably makes the config loader discard the
# inherited optimizer dict instead of merging into it; confirm in the mmcv
# Config documentation.
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
                 paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
                                                 'relative_position_bias_table': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))

# Replaces the base LR schedule: linear warmup over the first 1500 iterations,
# then 'poly' with power 1.0 down to a minimum LR of 0.
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# Two samples per GPU.
data=dict(samples_per_gpu=2)
def _check_quantization_args(min_val, max_val, levels):
    """Validate the arguments shared by ``quantize`` and ``dequantize``.

    Raises:
        ValueError: If ``levels`` is not an ``int`` greater than 1, or if
            ``min_val`` is not strictly smaller than ``max_val``.
    """
    if not (isinstance(levels, int) and levels > 1):
        # The check rejects levels == 1 as well, so the message says so.
        raise ValueError(
            f'levels must be an integer greater than 1, but got {levels}')
    if min_val >= max_val:
        raise ValueError(
            f'min_val ({min_val}) must be smaller than max_val ({max_val})')


def quantize(arr, min_val, max_val, levels, dtype=np.int64):
    """Quantize an array of (-inf, inf) to [0, levels-1].

    Values are clipped to ``[min_val, max_val]`` and the clipped range is
    split into ``levels`` equal-width bins; each value maps to its bin index.

    Args:
        arr (ndarray): Input array.
        min_val (scalar): Minimum value to be clipped.
        max_val (scalar): Maximum value to be clipped.
        levels (int): Quantization levels, must be greater than 1.
        dtype (np.type): The type of the quantized array.

    Returns:
        ndarray: Quantized array with values in ``[0, levels - 1]``.

    Raises:
        ValueError: If ``levels`` is not an int greater than 1 or
            ``min_val >= max_val``.
    """
    _check_quantization_args(min_val, max_val, levels)

    arr = np.clip(arr, min_val, max_val) - min_val
    # arr == max_val lands exactly on `levels` after scaling; np.minimum
    # folds it back into the last bin.
    quantized_arr = np.minimum(
        np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)

    return quantized_arr


def dequantize(arr, min_val, max_val, levels, dtype=np.float64):
    """Dequantize an array.

    Each bin index is mapped to the centre of its bin in
    ``[min_val, max_val]``.

    Args:
        arr (ndarray): Input array of bin indices. Lists and scalars are
            accepted as well, matching what ``quantize`` accepts.
        min_val (scalar): Minimum value to be clipped.
        max_val (scalar): Maximum value to be clipped.
        levels (int): Quantization levels, must be greater than 1.
        dtype (np.type): The type of the dequantized array.

    Returns:
        ndarray: Dequantized array.

    Raises:
        ValueError: If ``levels`` is not an int greater than 1 or
            ``min_val >= max_val``.
    """
    _check_quantization_args(min_val, max_val, levels)

    # np.asarray is a no-op for ndarrays and lets list/scalar input work
    # instead of failing on `+ 0.5` / `.astype`.
    dequantized_arr = (np.asarray(arr) + 0.5).astype(dtype) * (
        max_val - min_val) / levels + min_val

    return dequantized_arr
class AlexNet(nn.Module):
    """AlexNet backbone.

    Args:
        num_classes (int): number of classes for classification. When it is
            not positive (the default, -1) no classifier is built and
            ``forward`` returns the convolutional feature map.
    """

    def __init__(self, num_classes=-1):
        super().__init__()
        self.num_classes = num_classes

        # Five conv stages; max-pooling follows stages 1, 2 and 5.
        conv_stack = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        if self.num_classes > 0:
            # Fully connected head; expects a 256 x 6 x 6 feature map.
            head = [
                nn.Dropout(),
                nn.Linear(256 * 6 * 6, 4096),
                nn.ReLU(inplace=True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(inplace=True),
                nn.Linear(4096, num_classes),
            ]
            self.classifier = nn.Sequential(*head)

    def init_weights(self, pretrained=None):
        """Optionally load a checkpoint.

        Args:
            pretrained (str | None): Checkpoint location handed to
                ``load_checkpoint`` (non-strict). ``None`` keeps the default
                initialisation.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if pretrained is None:
            # use default initializer
            return
        if not isinstance(pretrained, str):
            raise TypeError('pretrained must be a str or None')

        root_logger = logging.getLogger()
        from ..runner import load_checkpoint
        load_checkpoint(self, pretrained, strict=False, logger=root_logger)

    def forward(self, x):
        """Run the feature extractor and, if present, the classifier."""
        feats = self.features(x)
        if self.num_classes > 0:
            flat = feats.view(feats.size(0), 256 * 6 * 6)
            return self.classifier(flat)
        return feats
+from .activation import build_activation_layer +from .context_block import ContextBlock +from .conv import build_conv_layer +from .conv2d_adaptive_padding import Conv2dAdaptivePadding +from .conv_module import ConvModule +from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d +from .depthwise_separable_conv_module import DepthwiseSeparableConvModule +from .drop import Dropout, DropPath +from .generalized_attention import GeneralizedAttention +from .hsigmoid import HSigmoid +from .hswish import HSwish +from .non_local import NonLocal1d, NonLocal2d, NonLocal3d +from .norm import build_norm_layer, is_norm +from .padding import build_padding_layer +from .plugin import build_plugin_layer +from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, + PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS) +from .scale import Scale +from .swish import Swish +from .upsample import build_upsample_layer +from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, + Linear, MaxPool2d, MaxPool3d) + +__all__ = [ + 'ConvModule', 'build_activation_layer', 'build_conv_layer', + 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', + 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', + 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', + 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS', + 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d', + 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear', + 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', + 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath' +] diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/bricks/activation.py b/lavis/common/annotator/uniformer/mmcv/cnn/bricks/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..cab2712287d5ef7be2f079dcb54a94b96394eab5 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/bricks/activation.py @@ -0,0 +1,92 @@ 
@ACTIVATION_LAYERS.register_module(name='Clip')
@ACTIVATION_LAYERS.register_module()
class Clamp(nn.Module):
    """Activation layer that limits feature values to :math:`[min, max]`.

    The class is registered twice, under its own name and under the alias
    ``'Clip'``. See ``torch.clamp()`` for the element-wise semantics.

    Args:
        min (Number | optional): Lower bound of the allowed range.
            Default to -1.
        max (Number | optional): Upper bound of the allowed range.
            Default to 1.
    """

    def __init__(self, min=-1., max=1.):
        super(Clamp, self).__init__()
        self.min, self.max = min, max

    def forward(self, x):
        """Clamp ``x`` to the configured range.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: Tensor with every element limited to
            ``[self.min, self.max]``.
        """
        lower, upper = self.min, self.max
        return torch.clamp(x, min=lower, max=upper)
def last_zero_init(m):
    """Apply ``constant_init(..., val=0)`` to ``m`` or to its last layer.

    For an ``nn.Sequential`` only the final sub-module is initialised; any
    other module is initialised directly.
    """
    target = m[-1] if isinstance(m, nn.Sequential) else m
    constant_init(target, val=0)
+ fusion_types (Sequence[str]): Fusion method for feature fusion, + Options are 'channels_add', 'channel_mul', stand for channelwise + addition and multiplication respectively. Default: ('channel_add',) + """ + + _abbr_ = 'context_block' + + def __init__(self, + in_channels, + ratio, + pooling_type='att', + fusion_types=('channel_add', )): + super(ContextBlock, self).__init__() + assert pooling_type in ['avg', 'att'] + assert isinstance(fusion_types, (list, tuple)) + valid_fusion_types = ['channel_add', 'channel_mul'] + assert all([f in valid_fusion_types for f in fusion_types]) + assert len(fusion_types) > 0, 'at least one fusion should be used' + self.in_channels = in_channels + self.ratio = ratio + self.planes = int(in_channels * ratio) + self.pooling_type = pooling_type + self.fusion_types = fusion_types + if pooling_type == 'att': + self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) + self.softmax = nn.Softmax(dim=2) + else: + self.avg_pool = nn.AdaptiveAvgPool2d(1) + if 'channel_add' in fusion_types: + self.channel_add_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_add_conv = None + if 'channel_mul' in fusion_types: + self.channel_mul_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_mul_conv = None + self.reset_parameters() + + def reset_parameters(self): + if self.pooling_type == 'att': + kaiming_init(self.conv_mask, mode='fan_in') + self.conv_mask.inited = True + + if self.channel_add_conv is not None: + last_zero_init(self.channel_add_conv) + if self.channel_mul_conv is not None: + last_zero_init(self.channel_mul_conv) + + def spatial_pool(self, x): + batch, channel, 
def build_conv_layer(cfg, *args, **kwargs):
    """Build convolution layer.

    Args:
        cfg (None or dict): The conv layer config, which should contain:
            - type (str): Layer type.
            - layer args: Args needed to instantiate a conv layer.
            ``None`` is treated as ``dict(type='Conv2d')``.
        args (argument list): Arguments passed to the `__init__`
            method of the corresponding conv layer.
        kwargs (keyword arguments): Keyword arguments passed to the `__init__`
            method of the corresponding conv layer.

    Returns:
        nn.Module: Created conv layer.

    Raises:
        TypeError: If ``cfg`` is neither None nor a dict.
        KeyError: If ``cfg`` has no ``"type"`` key, or the type is not
            registered in ``CONV_LAYERS``.
    """
    if cfg is None:
        cfg_ = dict(type='Conv2d')
    else:
        if not isinstance(cfg, dict):
            raise TypeError('cfg must be a dict')
        if 'type' not in cfg:
            raise KeyError('the cfg dict must contain the key "type"')
        # Work on a copy so popping 'type' does not mutate the caller's cfg.
        cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in CONV_LAYERS:
        # This builds conv layers; the message previously said "norm type".
        raise KeyError(f'Unrecognized conv type {layer_type}')
    conv_layer = CONV_LAYERS.get(layer_type)

    # Remaining cfg entries are forwarded as extra keyword arguments.
    layer = conv_layer(*args, **kwargs, **cfg_)

    return layer
Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, + dilation, groups, bias) + + def forward(self, x): + img_h, img_w = x.size()[-2:] + kernel_h, kernel_w = self.weight.size()[-2:] + stride_h, stride_w = self.stride + output_h = math.ceil(img_h / stride_h) + output_w = math.ceil(img_w / stride_w) + pad_h = ( + max((output_h - 1) * self.stride[0] + + (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) + pad_w = ( + max((output_w - 1) * self.stride[1] + + (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_module.py b/lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..e60e7e62245071c77b652093fddebff3948d7c3e --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_module.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
@PLUGIN_LAYERS.register_module()
class ConvModule(nn.Module):
    """Convolution block that chains conv, norm and activation layers.

    The three layers are created through ``build_conv_layer()``,
    ``build_norm_layer()`` and ``build_activation_layer()`` and are applied
    in the sequence given by ``order``.

    On top of a plain convolution this block:
        1. Chooses the conv ``bias`` automatically when ``bias='auto'``.
        2. Can wrap the conv layer with spectral norm.
        3. Uses a separate padding layer for padding modes other than
           'zeros' and 'circular'.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int | tuple[int]): Size of the convolving kernel.
        stride (int | tuple[int]): Stride of the convolution. Default: 1.
        padding (int | tuple[int]): Padding added to both sides of the
            input. Default: 0.
        dilation (int | tuple[int]): Spacing between kernel elements.
            Default: 1.
        groups (int): Number of blocked connections from input channels to
            output channels. Default: 1.
        bias (bool | str): With 'auto', the bias is enabled only when
            ``norm_cfg`` is None. Default: 'auto'.
        conv_cfg (dict): Config dict for the convolution layer.
            Default: None.
        norm_cfg (dict): Config dict for the normalization layer.
            Default: None.
        act_cfg (dict): Config dict for the activation layer.
            Default: dict(type='ReLU').
        inplace (bool): Value used for the activation's ``inplace`` option
            when the config does not set it. Default: True.
        with_spectral_norm (bool): Whether to apply spectral norm to the
            conv layer. Default: False.
        padding_mode (str): 'zeros' and 'circular' are passed to the conv
            layer as regular padding; any other mode is built as an explicit
            padding layer. Default: 'zeros'.
        order (tuple[str]): A permutation of "conv", "norm" and "act".
            Default: ('conv', 'norm', 'act').
    """

    _abbr_ = 'conv_block'

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias='auto',
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=dict(type='ReLU'),
                 inplace=True,
                 with_spectral_norm=False,
                 padding_mode='zeros',
                 order=('conv', 'norm', 'act')):
        super().__init__()
        for cfg in (conv_cfg, norm_cfg, act_cfg):
            assert cfg is None or isinstance(cfg, dict)

        natively_padded = ('zeros', 'circular')
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.inplace = inplace
        self.with_spectral_norm = with_spectral_norm
        self.with_explicit_padding = padding_mode not in natively_padded
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == {'conv', 'norm', 'act'}

        self.with_norm = norm_cfg is not None
        self.with_activation = act_cfg is not None
        # A conv followed by a norm layer does not need its own bias.
        if bias == 'auto':
            bias = not self.with_norm
        self.with_bias = bias

        if self.with_explicit_padding:
            # The padding is applied by a dedicated layer, so the conv
            # itself is built without padding.
            self.padding_layer = build_padding_layer(
                dict(type=padding_mode), padding)
            conv_padding = 0
        else:
            conv_padding = padding

        self.conv = build_conv_layer(
            conv_cfg,
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=conv_padding,
            dilation=dilation,
            groups=groups,
            bias=bias)

        # Mirror the conv layer's attributes on the block for convenience.
        built = self.conv
        self.in_channels = built.in_channels
        self.out_channels = built.out_channels
        self.kernel_size = built.kernel_size
        self.stride = built.stride
        self.padding = padding
        self.dilation = built.dilation
        self.transposed = built.transposed
        self.output_padding = built.output_padding
        self.groups = built.groups

        if self.with_spectral_norm:
            self.conv = nn.utils.spectral_norm(self.conv)

        if not self.with_norm:
            self.norm_name = None
        else:
            # The norm layer normalizes whatever feeds into it: the conv
            # output when it follows the conv, otherwise the block input.
            norm_after_conv = order.index('norm') > order.index('conv')
            norm_channels = out_channels if norm_after_conv else in_channels
            self.norm_name, norm_layer = build_norm_layer(
                norm_cfg, norm_channels)
            self.add_module(self.norm_name, norm_layer)
            if self.with_bias and isinstance(norm_layer,
                                             (_BatchNorm, _InstanceNorm)):
                warnings.warn(
                    'Unnecessary conv bias before batch/instance norm')

        if self.with_activation:
            # Work on a copy so the caller's config dict is left untouched.
            activation_cfg = act_cfg.copy()
            # These activation types are not given an 'inplace' option.
            without_inplace = [
                'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
            ]
            if activation_cfg['type'] not in without_inplace:
                activation_cfg.setdefault('inplace', inplace)
            self.activate = build_activation_layer(activation_cfg)

        # Use msra init by default.
        self.init_weights()

    @property
    def norm(self):
        """The norm layer registered under ``norm_name``, or None."""
        return getattr(self, self.norm_name) if self.norm_name else None

    def init_weights(self):
        """Initialize the conv (kaiming) and norm (constant) layers.

        A conv layer exposing its own ``init_weights`` attribute is left
        alone so its custom initialization is not overridden; every other
        conv layer is (re-)initialized with ``kaiming_init``.
        """
        if not hasattr(self.conv, 'init_weights'):
            leaky = (self.with_activation
                     and self.act_cfg['type'] == 'LeakyReLU')
            if leaky:
                nonlinearity = 'leaky_relu'
                a = self.act_cfg.get('negative_slope', 0.01)
            else:
                nonlinearity = 'relu'
                a = 0
            kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
        if self.with_norm:
            constant_init(self.norm, 1, bias=0)

    def forward(self, x, activate=True, norm=True):
        """Apply the layers in ``self.order``.

        Args:
            x: Input passed through the block.
            activate (bool): Set False to skip the activation layer.
            norm (bool): Set False to skip the norm layer.
        """
        for stage in self.order:
            if stage == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                x = self.conv(x)
                continue
            if stage == 'norm':
                if norm and self.with_norm:
                    x = self.norm(x)
                continue
            if stage == 'act' and activate and self.with_activation:
                x = self.activate(x)
        return x
def conv_ws_2d(input,
               weight,
               bias=None,
               stride=1,
               padding=0,
               dilation=1,
               groups=1,
               eps=1e-5):
    """2D convolution with weight standardization.

    Every output filter (a slice along dim 0 of ``weight``) is shifted to
    zero mean and divided by ``std + eps`` before the regular
    ``F.conv2d`` call.

    Args:
        input (Tensor): Input passed to ``F.conv2d``.
        weight (Tensor): Convolution weight; dim 0 indexes output filters.
        bias (Tensor, optional): Bias passed to ``F.conv2d``. Default: None.
        stride, padding, dilation, groups: Forwarded to ``F.conv2d``.
        eps (float): Added to the per-filter standard deviation to avoid
            division by zero. Default: 1e-5.

    Returns:
        Tensor: Result of the convolution with the standardized weight.
    """
    num_filters = weight.size(0)
    # ``reshape`` instead of ``view``: a weight stored in a non-contiguous
    # layout (e.g. channels_last) cannot be flattened with ``view``.
    weight_flat = weight.reshape(num_filters, -1)
    mean = weight_flat.mean(dim=1, keepdim=True).view(num_filters, 1, 1, 1)
    std = weight_flat.std(dim=1, keepdim=True).view(num_filters, 1, 1, 1)
    weight = (weight - mean) / (std + eps)
    return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
@CONV_LAYERS.register_module(name='ConvAWS')
class ConvAWS2d(nn.Conv2d):
    """Conv2d with Adaptive Weight Standardization (AWS).

    A variant of Weight Standardization
    (https://arxiv.org/pdf/1903.10520.pdf), used in DetectoRS to avoid NaN
    (https://arxiv.org/pdf/2006.02334.pdf). The weight is standardized per
    output filter and then rescaled with the ``weight_gamma`` and
    ``weight_beta`` buffers.

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int or tuple): Size of the conv kernel.
        stride (int or tuple, optional): Stride of the convolution.
            Default: 1.
        padding (int or tuple, optional): Zero-padding added to both sides
            of the input. Default: 0.
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1.
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1.
        bias (bool, optional): If True, adds a learnable bias to the output.
            Default: True.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        per_filter = (self.out_channels, 1, 1, 1)
        self.register_buffer('weight_gamma', torch.ones(*per_filter))
        self.register_buffer('weight_beta', torch.zeros(*per_filter))

    def _get_weight(self, weight):
        """Standardize ``weight`` per filter, then apply gamma and beta."""
        flat = weight.view(weight.size(0), -1)
        filter_mean = flat.mean(dim=1).view(-1, 1, 1, 1)
        filter_std = torch.sqrt(flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        standardized = (weight - filter_mean) / filter_std
        return self.weight_gamma * standardized + self.weight_beta

    def forward(self, x):
        """Convolve ``x`` with the adaptively standardized weight."""
        return F.conv2d(x, self._get_weight(self.weight), self.bias,
                        self.stride, self.padding, self.dilation,
                        self.groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Load parameters, recovering gamma/beta when the checkpoint lacks them.

        ``weight_gamma`` is filled with -1 before loading. If its mean is
        positive afterwards it was provided by the checkpoint and nothing
        else is done. Otherwise ``weight_beta`` and ``weight_gamma`` are set
        to the per-filter mean and std of the loaded weight, and their
        entries are not reported as missing keys.
        """
        # Sentinel: stays negative unless the checkpoint overwrites it.
        self.weight_gamma.data.fill_(-1)
        missing_here = []
        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_here, unexpected_keys,
                                      error_msgs)
        if self.weight_gamma.data.mean() > 0:
            missing_keys.extend(missing_here)
            return

        loaded = self.weight.data
        flat = loaded.view(loaded.size(0), -1)
        filter_mean = flat.mean(dim=1).view(-1, 1, 1, 1)
        filter_std = torch.sqrt(flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        self.weight_beta.data.copy_(filter_mean)
        self.weight_gamma.data.copy_(filter_std)

        # gamma/beta were just reconstructed, so they are not "missing".
        recovered = ('weight_gamma', 'weight_beta')
        missing_keys.extend(
            key for key in missing_here if not key.endswith(recovered))
class DepthwiseSeparableConvModule(nn.Module):
    """Depthwise separable convolution built from two ``ConvModule`` blocks.

    See https://arxiv.org/pdf/1704.04861.pdf for details.

    The input first goes through a depthwise ``ConvModule``
    (``groups=in_channels``, ``in_channels`` outputs, the given kernel) and
    then through a pointwise ``ConvModule`` (1x1 kernel, ``out_channels``
    outputs). Each of the two blocks gets its own norm/activation config.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels produced by the module.
        kernel_size (int | tuple[int]): Kernel size of the depthwise conv.
        stride (int | tuple[int]): Stride of the depthwise conv. Default: 1.
        padding (int | tuple[int]): Padding of the depthwise conv.
            Default: 0.
        dilation (int | tuple[int]): Dilation of the depthwise conv.
            Default: 1.
        norm_cfg (dict): Norm config shared by both blocks unless
            overridden. Default: None.
        act_cfg (dict): Activation config shared by both blocks unless
            overridden. Default: dict(type='ReLU').
        dw_norm_cfg (dict): Norm config of the depthwise block; 'default'
            means ``norm_cfg``. Default: 'default'.
        dw_act_cfg (dict): Activation config of the depthwise block;
            'default' means ``act_cfg``. Default: 'default'.
        pw_norm_cfg (dict): Norm config of the pointwise block; 'default'
            means ``norm_cfg``. Default: 'default'.
        pw_act_cfg (dict): Activation config of the pointwise block;
            'default' means ``act_cfg``. Default: 'default'.
        kwargs (optional): Extra arguments forwarded to both ConvModules.
            ``groups`` must not be among them.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 norm_cfg=None,
                 act_cfg=dict(type='ReLU'),
                 dw_norm_cfg='default',
                 dw_act_cfg='default',
                 pw_norm_cfg='default',
                 pw_act_cfg='default',
                 **kwargs):
        super().__init__()
        assert 'groups' not in kwargs, 'groups should not be specified'

        # Fall back to the shared configs wherever 'default' was requested.
        if dw_norm_cfg == 'default':
            dw_norm_cfg = norm_cfg
        if dw_act_cfg == 'default':
            dw_act_cfg = act_cfg
        if pw_norm_cfg == 'default':
            pw_norm_cfg = norm_cfg
        if pw_act_cfg == 'default':
            pw_act_cfg = act_cfg

        # Depthwise stage: one group per input channel.
        self.depthwise_conv = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            norm_cfg=dw_norm_cfg,
            act_cfg=dw_act_cfg,
            **kwargs)

        # Pointwise stage: 1x1 conv mixing the channels.
        self.pointwise_conv = ConvModule(
            in_channels,
            out_channels,
            1,
            norm_cfg=pw_norm_cfg,
            act_cfg=pw_act_cfg,
            **kwargs)

    def forward(self, x):
        """Run the depthwise block followed by the pointwise block."""
        return self.pointwise_conv(self.depthwise_conv(x))
def drop_path(x, drop_prob=0., training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    We follow the implementation
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501

    Args:
        x (Tensor): Input whose first dimension indexes the samples.
        drop_prob (float): Probability of zeroing a sample. Default: 0.
        training (bool): Paths are only dropped when True. Default: False.

    Returns:
        Tensor: ``x`` itself when ``drop_prob`` is 0 or ``training`` is
        False; otherwise a tensor in which each sample is either zeroed or
        scaled by ``1 / (1 - drop_prob)``.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # Every path is dropped. Dividing by keep_prob here would produce
        # inf * 0 = NaN, so return zeros while staying in the autograd graph.
        return x * 0
    # handle tensors with different dimensions, not just 4D tensors.
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
    # keep_prob + U[0, 1) floors to 1 with probability keep_prob, else to 0.
    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    output = x.div(keep_prob) * random_tensor.floor()
    return output
@PLUGIN_LAYERS.register_module()
class GeneralizedAttention(nn.Module):
    """GeneralizedAttention module.

    See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
    (https://arxiv.org/abs/1904.05873) for details.

    Args:
        in_channels (int): Channels of the input feature map.
        spatial_range (int): The spatial range. -1 indicates no spatial range
            constraint. A non-negative value is only supported for
            ``in_channels`` of 256 or 512. Default: -1.
        num_heads (int): The head number of empirical_attention module.
            Default: 9.
        position_embedding_dim (int): The position embedding dimension.
            A non-positive value means ``in_channels``. Default: -1.
        position_magnitude (int): A multiplier acting on coord difference.
            Default: 1.
        kv_stride (int): The feature stride acting on key/value feature map.
            Default: 2.
        q_stride (int): The feature stride acting on query feature map.
            Default: 1.
        attention_type (str): A binary indicator string for indicating which
            items in generalized empirical_attention module are used.
            Default: '1111'.

            - '1000' indicates 'query and key content' (appr - appr) item,
            - '0100' indicates 'query content and relative position'
              (appr - position) item,
            - '0010' indicates 'key content only' (bias - appr) item,
            - '0001' indicates 'relative position only' (bias - position) item.

    Raises:
        ValueError: If ``spatial_range >= 0`` and ``in_channels`` is neither
            256 nor 512.
    """

    _abbr_ = 'gen_attention_block'

    def __init__(self,
                 in_channels,
                 spatial_range=-1,
                 num_heads=9,
                 position_embedding_dim=-1,
                 position_magnitude=1,
                 kv_stride=2,
                 q_stride=1,
                 attention_type='1111'):

        super(GeneralizedAttention, self).__init__()

        # hard range means local range for non-local operation
        self.position_embedding_dim = (
            position_embedding_dim
            if position_embedding_dim > 0 else in_channels)

        self.position_magnitude = position_magnitude
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.spatial_range = spatial_range
        self.kv_stride = kv_stride
        self.q_stride = q_stride
        # One boolean flag per character of the indicator string.
        self.attention_type = [bool(int(_)) for _ in attention_type]
        self.qk_embed_dim = in_channels // num_heads
        out_c = self.qk_embed_dim * num_heads

        if self.attention_type[0] or self.attention_type[1]:
            self.query_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_c,
                kernel_size=1,
                bias=False)
            self.query_conv.kaiming_init = True

        if self.attention_type[0] or self.attention_type[2]:
            self.key_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_c,
                kernel_size=1,
                bias=False)
            self.key_conv.kaiming_init = True

        self.v_dim = in_channels // num_heads
        self.value_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=self.v_dim * num_heads,
            kernel_size=1,
            bias=False)
        self.value_conv.kaiming_init = True

        if self.attention_type[1] or self.attention_type[3]:
            self.appr_geom_fc_x = nn.Linear(
                self.position_embedding_dim // 2, out_c, bias=False)
            self.appr_geom_fc_x.kaiming_init = True

            self.appr_geom_fc_y = nn.Linear(
                self.position_embedding_dim // 2, out_c, bias=False)
            self.appr_geom_fc_y.kaiming_init = True

        if self.attention_type[2]:
            # Uniform init in (-stdv, stdv].
            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
            appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
            self.appr_bias = nn.Parameter(appr_bias_value)

        if self.attention_type[3]:
            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
            geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
            self.geom_bias = nn.Parameter(geom_bias_value)

        self.proj_conv = nn.Conv2d(
            in_channels=self.v_dim * num_heads,
            out_channels=in_channels,
            kernel_size=1,
            bias=True)
        self.proj_conv.kaiming_init = True
        # Zero-initialized scale of the attention branch in ``forward``.
        self.gamma = nn.Parameter(torch.zeros(1))

        if self.spatial_range >= 0:
            # only works when non local is after 3*3 conv
            if in_channels == 256:
                max_len = 84
            elif in_channels == 512:
                max_len = 42
            else:
                # Previously this fell through and crashed below with an
                # unbound ``max_len``; fail with an explicit message instead.
                raise ValueError(
                    'spatial_range >= 0 is only supported for in_channels '
                    f'of 256 or 512, but got {in_channels}.')

            max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
            # 1 marks key/value positions outside the local window (masked
            # in ``forward``), 0 marks positions inside it. ``np.int`` was
            # removed in NumPy 1.24, so build the uint8 mask directly.
            local_constraint_map = np.ones(
                (max_len, max_len, max_len_kv, max_len_kv), dtype=np.uint8)
            for iy in range(max_len):
                for ix in range(max_len):
                    local_constraint_map[
                        iy, ix,
                        max((iy - self.spatial_range) //
                            self.kv_stride, 0):min((iy + self.spatial_range +
                                                    1) // self.kv_stride +
                                                   1, max_len),
                        max((ix - self.spatial_range) //
                            self.kv_stride, 0):min((ix + self.spatial_range +
                                                    1) // self.kv_stride +
                                                   1, max_len)] = 0

            # Kept as a uint8 parameter so existing checkpoints still load.
            self.local_constraint_map = nn.Parameter(
                torch.from_numpy(local_constraint_map).byte(),
                requires_grad=False)

        if self.q_stride > 1:
            self.q_downsample = nn.AvgPool2d(
                kernel_size=1, stride=self.q_stride)
        else:
            self.q_downsample = None

        if self.kv_stride > 1:
            self.kv_downsample = nn.AvgPool2d(
                kernel_size=1, stride=self.kv_stride)
        else:
            self.kv_downsample = None

        self.init_weights()

    def get_position_embedding(self,
                               h,
                               w,
                               h_kv,
                               w_kv,
                               q_stride,
                               kv_stride,
                               device,
                               dtype,
                               feat_dim,
                               wave_length=1000):
        """Build sin/cos embeddings of query-to-key coordinate differences.

        Returns:
            tuple[Tensor, Tensor]: ``(embedding_x, embedding_y)`` computed
            from the width and height coordinate differences respectively;
            sine and cosine terms are concatenated along the last dimension.
        """
        # the default type of Tensor is float32, leading to type mismatch
        # in fp16 mode. Cast it to support fp16 mode.
        h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
        h_idxs = h_idxs.view((h, 1)) * q_stride

        w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
        w_idxs = w_idxs.view((w, 1)) * q_stride

        h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
            device=device, dtype=dtype)
        h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride

        w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
            device=device, dtype=dtype)
        w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride

        # (h, h_kv, 1)
        h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
        h_diff *= self.position_magnitude

        # (w, w_kv, 1)
        w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
        w_diff *= self.position_magnitude

        feat_range = torch.arange(0, feat_dim / 4).to(
            device=device, dtype=dtype)

        dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
        dim_mat = dim_mat**((4. / feat_dim) * feat_range)
        dim_mat = dim_mat.view((1, 1, -1))

        embedding_x = torch.cat(
            ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)

        embedding_y = torch.cat(
            ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)

        return embedding_x, embedding_y

    def forward(self, x_input):
        """Add the ``gamma``-scaled attention output to ``x_input``."""
        num_heads = self.num_heads

        # use empirical_attention
        if self.q_downsample is not None:
            x_q = self.q_downsample(x_input)
        else:
            x_q = x_input
        n, _, h, w = x_q.shape

        if self.kv_downsample is not None:
            x_kv = self.kv_downsample(x_input)
        else:
            x_kv = x_input
        _, _, h_kv, w_kv = x_kv.shape

        if self.attention_type[0] or self.attention_type[1]:
            proj_query = self.query_conv(x_q).view(
                (n, num_heads, self.qk_embed_dim, h * w))
            proj_query = proj_query.permute(0, 1, 3, 2)

        if self.attention_type[0] or self.attention_type[2]:
            proj_key = self.key_conv(x_kv).view(
                (n, num_heads, self.qk_embed_dim, h_kv * w_kv))

        if self.attention_type[1] or self.attention_type[3]:
            position_embed_x, position_embed_y = self.get_position_embedding(
                h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
                x_input.device, x_input.dtype, self.position_embedding_dim)
            # (n, num_heads, w, w_kv, dim)
            position_feat_x = self.appr_geom_fc_x(position_embed_x).\
                view(1, w, w_kv, num_heads, self.qk_embed_dim).\
                permute(0, 3, 1, 2, 4).\
                repeat(n, 1, 1, 1, 1)

            # (n, num_heads, h, h_kv, dim)
            position_feat_y = self.appr_geom_fc_y(position_embed_y).\
                view(1, h, h_kv, num_heads, self.qk_embed_dim).\
                permute(0, 3, 1, 2, 4).\
                repeat(n, 1, 1, 1, 1)

            position_feat_x /= math.sqrt(2)
            position_feat_y /= math.sqrt(2)

        # accelerate for saliency only
        if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
            appr_bias = self.appr_bias.\
                view(1, num_heads, 1, self.qk_embed_dim).\
                repeat(n, 1, 1, 1)

            energy = torch.matmul(appr_bias, proj_key).\
                view(n, num_heads, 1, h_kv * w_kv)

            # The energy does not depend on the query position here, so a
            # single query location is used and broadcast at the end.
            h = 1
            w = 1
        else:
            # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
            if not self.attention_type[0]:
                energy = torch.zeros(
                    n,
                    num_heads,
                    h,
                    w,
                    h_kv,
                    w_kv,
                    dtype=x_input.dtype,
                    device=x_input.device)

            # attention_type[0]: appr - appr
            # attention_type[1]: appr - position
            # attention_type[2]: bias - appr
            # attention_type[3]: bias - position
            if self.attention_type[0] or self.attention_type[2]:
                if self.attention_type[0] and self.attention_type[2]:
                    appr_bias = self.appr_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim)
                    energy = torch.matmul(proj_query + appr_bias, proj_key).\
                        view(n, num_heads, h, w, h_kv, w_kv)

                elif self.attention_type[0]:
                    energy = torch.matmul(proj_query, proj_key).\
                        view(n, num_heads, h, w, h_kv, w_kv)

                elif self.attention_type[2]:
                    appr_bias = self.appr_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim).\
                        repeat(n, 1, 1, 1)

                    energy += torch.matmul(appr_bias, proj_key).\
                        view(n, num_heads, 1, 1, h_kv, w_kv)

            if self.attention_type[1] or self.attention_type[3]:
                if self.attention_type[1] and self.attention_type[3]:
                    geom_bias = self.geom_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim)

                    proj_query_reshape = (proj_query + geom_bias).\
                        view(n, num_heads, h, w, self.qk_embed_dim)

                    energy_x = torch.matmul(
                        proj_query_reshape.permute(0, 1, 3, 2, 4),
                        position_feat_x.permute(0, 1, 2, 4, 3))
                    energy_x = energy_x.\
                        permute(0, 1, 3, 2, 4).unsqueeze(4)

                    energy_y = torch.matmul(
                        proj_query_reshape,
                        position_feat_y.permute(0, 1, 2, 4, 3))
                    energy_y = energy_y.unsqueeze(5)

                    energy += energy_x + energy_y

                elif self.attention_type[1]:
                    proj_query_reshape = proj_query.\
                        view(n, num_heads, h, w, self.qk_embed_dim)
                    position_feat_x_reshape = position_feat_x.\
                        permute(0, 1, 2, 4, 3)
                    position_feat_y_reshape = position_feat_y.\
                        permute(0, 1, 2, 4, 3)

                    # The x term batches over w, so only that product uses
                    # the (h, w)-swapped query; the y term batches over h
                    # and needs the un-permuted query, exactly as in the
                    # combined branch above.
                    energy_x = torch.matmul(
                        proj_query_reshape.permute(0, 1, 3, 2, 4),
                        position_feat_x_reshape)
                    energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)

                    energy_y = torch.matmul(proj_query_reshape,
                                            position_feat_y_reshape)
                    energy_y = energy_y.unsqueeze(5)

                    energy += energy_x + energy_y

                elif self.attention_type[3]:
                    geom_bias = self.geom_bias.\
                        view(1, num_heads, self.qk_embed_dim, 1).\
                        repeat(n, 1, 1, 1)

                    position_feat_x_reshape = position_feat_x.\
                        view(n, num_heads, w*w_kv, self.qk_embed_dim)

                    position_feat_y_reshape = position_feat_y.\
                        view(n, num_heads, h * h_kv, self.qk_embed_dim)

                    energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
                    energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)

                    energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
                    energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)

                    energy += energy_x + energy_y

            energy = energy.view(n, num_heads, h * w, h_kv * w_kv)

        if self.spatial_range >= 0:
            # ``masked_fill_`` expects a boolean mask; the stored map is
            # uint8, so cast it here.
            cur_local_constraint_map = \
                self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
                contiguous().\
                view(1, 1, h*w, h_kv*w_kv).\
                bool()

            energy = energy.masked_fill_(cur_local_constraint_map,
                                         float('-inf'))

        attention = F.softmax(energy, 3)

        proj_value = self.value_conv(x_kv)
        proj_value_reshape = proj_value.\
            view((n, num_heads, self.v_dim, h_kv * w_kv)).\
            permute(0, 1, 3, 2)

        out = torch.matmul(attention, proj_value_reshape).\
            permute(0, 1, 3, 2).\
            contiguous().\
            view(n, self.v_dim * self.num_heads, h, w)

        out = self.proj_conv(out)

        # output is downsampled, upsample back to input size
        if self.q_downsample is not None:
            out = F.interpolate(
                out,
                size=x_input.shape[2:],
                mode='bilinear',
                align_corners=False)

        out = self.gamma * out + x_input
        return out

    def init_weights(self):
        """Apply ``kaiming_init`` to every submodule flagged for it."""
        for m in self.modules():
            if hasattr(m, 'kaiming_init') and m.kaiming_init:
                kaiming_init(
                    m,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    bias=0,
                    distribution='uniform',
                    a=1)
@ACTIVATION_LAYERS.register_module()
class HSigmoid(nn.Module):
    """Hard sigmoid activation.

    Computes
        Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
    which with the default arguments is
        Hsigmoid(x) = min(max((x + 1) / 2, 0), 1)

    Args:
        bias (float): Value added to the input. Default: 1.0.
        divisor (float): Value the shifted input is divided by; must be
            non-zero. Default: 2.0.
        min_value (float): Lower clamp bound. Default: 0.0.
        max_value (float): Upper clamp bound. Default: 1.0.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0):
        super().__init__()
        self.bias, self.divisor = bias, divisor
        assert self.divisor != 0
        self.min_value, self.max_value = min_value, max_value

    def forward(self, x):
        # The shift/divide creates a new tensor, so clamping it in place
        # does not modify the caller's input.
        scaled = (x + self.bias) / self.divisor
        return scaled.clamp_(self.min_value, self.max_value)
class _NonLocalNd(nn.Module, metaclass=ABCMeta):
    """Base class of the non-local modules.

    Proposed in "Non-local Neural Networks".
    Paper reference: https://arxiv.org/abs/1711.07971
    Code reference: https://github.com/AlexHex7/Non-local_pytorch

    ``forward`` reads ``self.sub_sample`` in `gaussian` mode; this class does
    not set it, so subclasses are expected to.

    Args:
        in_channels (int): Channels of the input feature map.
        reduction (int): Channel reduction ratio. Default: 2.
        use_scale (bool): Whether to scale pairwise_weight by
            `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
            Default: True.
        conv_cfg (None | dict): Config dict for the convolution layers.
            Default: None.
        norm_cfg (None | dict): Config dict for the normalization layer;
            it is only passed to ``conv_out``. Default: None.
        mode (str): One of `gaussian`, `concatenation`,
            `embedded_gaussian` and `dot_product`.
            Default: embedded_gaussian.
        kwargs: Forwarded to ``init_weights``.
    """

    def __init__(self,
                 in_channels,
                 reduction=2,
                 use_scale=True,
                 conv_cfg=None,
                 norm_cfg=None,
                 mode='embedded_gaussian',
                 **kwargs):
        super().__init__()
        self.in_channels = in_channels
        self.reduction = reduction
        self.use_scale = use_scale
        self.inter_channels = max(in_channels // reduction, 1)
        self.mode = mode

        supported_modes = ('gaussian', 'embedded_gaussian', 'dot_product',
                           'concatenation')
        if mode not in supported_modes:
            raise ValueError("Mode should be in 'gaussian', 'concatenation', "
                             f"'embedded_gaussian' or 'dot_product', but got "
                             f'{mode} instead.')

        # Settings shared by the 1x1 projection blocks g, theta and phi.
        # ConvModule is used instead of a bare conv for potential usage.
        projection_cfg = dict(kernel_size=1, conv_cfg=conv_cfg, act_cfg=None)

        self.g = ConvModule(self.in_channels, self.inter_channels,
                            **projection_cfg)
        self.conv_out = ConvModule(
            self.inter_channels,
            self.in_channels,
            kernel_size=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

        if self.mode != 'gaussian':
            self.theta = ConvModule(self.in_channels, self.inter_channels,
                                    **projection_cfg)
            self.phi = ConvModule(self.in_channels, self.inter_channels,
                                  **projection_cfg)

        if self.mode == 'concatenation':
            self.concat_project = ConvModule(
                self.inter_channels * 2,
                1,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
                act_cfg=dict(type='ReLU'))

        self.init_weights(**kwargs)

    def init_weights(self, std=0.01, zeros_init=True):
        """Initialize the projection convs and the output layer.

        Args:
            std (float): Std used for every ``normal_init`` call.
            zeros_init (bool): If True the output layer (the norm of
                ``conv_out`` when it has a norm config, else its conv) is
                set to constant 0; otherwise it is normal-initialized.
        """
        if self.mode == 'gaussian':
            projections = [self.g]
        else:
            projections = [self.g, self.theta, self.phi]
        for block in projections:
            normal_init(block.conv, std=std)

        if self.conv_out.norm_cfg is None:
            out_layer = self.conv_out.conv
        else:
            out_layer = self.conv_out.norm

        if zeros_init:
            constant_init(out_layer, 0)
        else:
            normal_init(out_layer, std=std)

    def gaussian(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        return torch.matmul(theta_x, phi_x).softmax(dim=-1)

    def embedded_gaussian(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        similarity = torch.matmul(theta_x, phi_x)
        if self.use_scale:
            # theta_x.shape[-1] is `self.inter_channels`
            similarity /= theta_x.shape[-1]**0.5
        return similarity.softmax(dim=-1)

    def dot_product(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        similarity = torch.matmul(theta_x, phi_x)
        similarity /= similarity.shape[-1]
        return similarity

    def concatenation(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        rows = theta_x.size(2)
        cols = phi_x.size(3)
        # Tile both operands to a common (rows, cols) grid before
        # concatenating them along the channel dimension.
        tiled_theta = theta_x.repeat(1, 1, 1, cols)
        tiled_phi = phi_x.repeat(1, 1, rows, 1)

        projected = self.concat_project(
            torch.cat([tiled_theta, tiled_phi], dim=1))
        n, _, h, w = projected.size()
        pairwise_weight = projected.view(n, h, w)
        pairwise_weight /= pairwise_weight.shape[-1]

        return pairwise_weight

    def forward(self, x):
        """Return ``x`` plus the projected non-local aggregation of ``x``."""
        # Assume `reduction = 1`, then `inter_channels = C`
        # or `inter_channels = C` when `mode="gaussian"`

        # NonLocal1d x: [N, C, H]
        # NonLocal2d x: [N, C, H, W]
        # NonLocal3d x: [N, C, T, H, W]
        n = x.size(0)

        # NonLocal1d g_x: [N, H, C]
        # NonLocal2d g_x: [N, HxW, C]
        # NonLocal3d g_x: [N, TxHxW, C]
        g_x = self.g(x).view(n, self.inter_channels, -1).permute(0, 2, 1)

        # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
        # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
        # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
        if self.mode == 'gaussian':
            # No theta projection in this mode: the raw input is used.
            theta_x = x.view(n, self.in_channels, -1).permute(0, 2, 1)
            phi_source = self.phi(x) if self.sub_sample else x
            phi_x = phi_source.view(n, self.in_channels, -1)
        elif self.mode == 'concatenation':
            theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
            phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
        else:
            theta_x = self.theta(x).view(n, self.inter_channels, -1)
            theta_x = theta_x.permute(0, 2, 1)
            phi_x = self.phi(x).view(n, self.inter_channels, -1)

        # The pairwise function is the method named after the mode.
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = getattr(self, self.mode)(theta_x, phi_x)

        # NonLocal1d y: [N, H, C]
        # NonLocal2d y: [N, HxW, C]
        # NonLocal3d y: [N, TxHxW, C]
        y = torch.matmul(pairwise_weight, g_x)
        # NonLocal1d y: [N, C, H]
        # NonLocal2d y: [N, C, H, W]
        # NonLocal3d y: [N, C, T, H, W]
        spatial_dims = x.size()[2:]
        y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
                                                    *spatial_dims)

        return x + self.conv_out(y)
class NonLocal1d(_NonLocalNd):
    """Non-local block for 1D feature maps.

    Args:
        in_channels (int): Forwarded to ``_NonLocalNd``.
        sub_sample (bool): When True, a ``MaxPool1d(kernel_size=2)`` is
            appended to the ``g`` projection and to ``phi`` (in ``gaussian``
            mode, where no ``phi`` projection exists, the pooling layer
            itself becomes ``phi``). Default: False.
        conv_cfg (None | dict): Forwarded to ``_NonLocalNd``.
            Default: dict(type='Conv1d').
        **kwargs: Remaining ``_NonLocalNd`` arguments.
    """

    def __init__(self,
                 in_channels,
                 sub_sample=False,
                 conv_cfg=dict(type='Conv1d'),
                 **kwargs):
        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
        self.sub_sample = sub_sample

        if not sub_sample:
            return

        # One pooling module is shared by both branches.
        pool = nn.MaxPool1d(kernel_size=2)
        self.g = nn.Sequential(self.g, pool)
        if self.mode == 'gaussian':
            self.phi = pool
        else:
            self.phi = nn.Sequential(self.phi, pool)
class NonLocal3d(_NonLocalNd):
    """Non-local block for 3D feature maps.

    Args:
        in_channels (int): Forwarded to ``_NonLocalNd``.
        sub_sample (bool): When True, a ``MaxPool3d(kernel_size=(1, 2, 2))``
            is appended to the ``g`` projection and to ``phi`` (in
            ``gaussian`` mode, where no ``phi`` projection exists, the
            pooling layer itself becomes ``phi``). The kernel size of 1 on
            the first pooled axis leaves that axis unpooled.
            Default: False.
        conv_cfg (None | dict): Forwarded to ``_NonLocalNd``.
            Default: dict(type='Conv3d').
        **kwargs: Remaining ``_NonLocalNd`` arguments.
    """

    def __init__(self,
                 in_channels,
                 sub_sample=False,
                 conv_cfg=dict(type='Conv3d'),
                 **kwargs):
        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
        self.sub_sample = sub_sample

        if not sub_sample:
            return

        # One pooling module is shared by both branches.
        pool = nn.MaxPool3d(kernel_size=(1, 2, 2))
        self.g = nn.Sequential(self.g, pool)
        if self.mode == 'gaussian':
            self.phi = pool
        else:
            self.phi = nn.Sequential(self.phi, pool)
def infer_abbr(class_type):
    """Infer the short name used for a norm layer attribute.

    ``build_norm_layer()`` uses this abbreviation (plus a postfix) to name
    the created layer, e.g. ``bn1`` or ``gn``.

    Resolution order:

    1. A ``_abbr_`` attribute on the class wins.
    2. Subclasses of ``_InstanceNorm``, ``_BatchNorm``, ``nn.GroupNorm`` and
       ``nn.LayerNorm`` map to ``"in"``, ``"bn"``, ``"gn"`` and ``"ln"``.
    3. Otherwise the lower-cased class name is searched for ``"batch"``,
       ``"group"``, ``"layer"`` and ``"instance"`` (in that order).
    4. If nothing matches, ``"norm_layer"`` is returned.

    Args:
        class_type (type): The norm layer class.

    Returns:
        str: The inferred abbreviation.

    Raises:
        TypeError: If ``class_type`` is not a class.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')

    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_

    # _InstanceNorm is tested before _BatchNorm because IN is a subclass
    # of BN, so the more specific base has to win.
    by_base = (
        (_InstanceNorm, 'in'),
        (_BatchNorm, 'bn'),
        (nn.GroupNorm, 'gn'),
        (nn.LayerNorm, 'ln'),
    )
    for base, abbr in by_base:
        if issubclass(class_type, base):
            return abbr

    # Fall back to keywords in the class name.
    by_keyword = (
        ('batch', 'bn'),
        ('group', 'gn'),
        ('layer', 'ln'),
        ('instance', 'in'),
    )
    lowered_name = class_type.__name__.lower()
    for keyword, abbr in by_keyword:
        if keyword in lowered_name:
            return abbr

    return 'norm_layer'
def is_norm(layer, exclude=None):
    """Tell whether ``layer`` is a normalization layer.

    Args:
        layer (nn.Module): The layer to be checked.
        exclude (type | tuple[type] | None): Norm types that should not
            count as a match. Default: None.

    Returns:
        bool: True if ``layer`` is an instance of ``_BatchNorm``,
        ``_InstanceNorm``, ``nn.GroupNorm`` or ``nn.LayerNorm`` and not an
        instance of any excluded type.

    Raises:
        TypeError: If ``exclude`` is neither None, a type, nor a tuple of
            types.
    """
    if exclude is not None:
        # Normalise a single type to a one-element tuple.
        exclude = exclude if isinstance(exclude, tuple) else (exclude, )
        if not is_tuple_of(exclude, type):
            raise TypeError(
                f'"exclude" must be either None or type or a tuple of types, '
                f'but got {type(exclude)}: {exclude}')
        # An empty tuple excludes nothing.
        if exclude and isinstance(layer, exclude):
            return False

    norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
    return isinstance(layer, norm_bases)
def build_padding_layer(cfg, *args, **kwargs):
    """Instantiate a padding layer registered in ``PADDING_LAYERS``.

    Args:
        cfg (dict): The padding layer config, which should contain:

            - type (str): Registered layer type.
            - layer args: Keyword arguments for the layer constructor.
        *args: Positional arguments passed to the layer constructor.
        **kwargs: Keyword arguments passed to the layer constructor.

    Returns:
        nn.Module: Created padding layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no ``"type"`` key or the type is not
            registered.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    # Copy so that popping "type" leaves the caller's config untouched.
    layer_kwargs = cfg.copy()
    padding_type = layer_kwargs.pop('type')
    if padding_type not in PADDING_LAYERS:
        raise KeyError(f'Unrecognized padding type {padding_type}.')

    padding_cls = PADDING_LAYERS.get(padding_type)
    return padding_cls(*args, **kwargs, **layer_kwargs)
def build_plugin_layer(cfg, postfix='', **kwargs):
    """Instantiate a plugin layer registered in ``PLUGIN_LAYERS``.

    Args:
        cfg (dict): Plugin config, which should contain:

            - type (str): Registered plugin layer type.
            - layer args: Keyword arguments for the plugin constructor.
        postfix (int | str): Appended to the inferred abbreviation to form
            the layer name. Default: ''.
        **kwargs: Extra keyword arguments for the plugin constructor.

    Returns:
        tuple[str, nn.Module]: ``(name, layer)`` where ``name`` is the
        abbreviation from ``infer_abbr`` followed by ``postfix`` and
        ``layer`` is the created plugin layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no ``"type"`` key or the type is not
            registered.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    # Copy so that popping "type" leaves the caller's config untouched.
    layer_kwargs = cfg.copy()
    layer_type = layer_kwargs.pop('type')
    if layer_type not in PLUGIN_LAYERS:
        raise KeyError(f'Unrecognized plugin type {layer_type}')

    plugin_cls = PLUGIN_LAYERS.get(layer_type)
    abbr = infer_abbr(plugin_cls)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    return name, plugin_cls(**kwargs, **layer_kwargs)
class Scale(nn.Module):
    """Multiply the input by a single learnable factor.

    The factor is stored as a float ``nn.Parameter`` built from ``scale`` and
    is applied to the input with ordinary broadcasting.

    Args:
        scale (float): Initial value of the scale factor. Default: 1.0
    """

    def __init__(self, scale=1.0):
        super().__init__()
        initial_value = torch.tensor(scale, dtype=torch.float)
        self.scale = nn.Parameter(initial_value)

    def forward(self, x):
        """Return ``x`` multiplied by the learnable factor."""
        scaled = x * self.scale
        return scaled
@ACTIVATION_LAYERS.register_module()
class Swish(nn.Module):
    """Swish activation.

    Applies the element-wise function:

    .. math::
        Swish(x) = x * Sigmoid(x)

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Gate ``x`` with its own sigmoid."""
        gate = torch.sigmoid(x)
        return x * gate
@ATTENTION.register_module()
class MultiheadAttention(BaseModule):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    This module implements MultiheadAttention with identity connection,
    and positional encoding is also passed as input.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        attn_drop (float): A Dropout layer on attn_output_weights.
            Default: 0.0.
        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
            Default: 0.0.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut. A falsy value disables it
            (``nn.Identity`` is used instead).
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): When it is True, Key, Query and Value are shape of
            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
             Default to False.
        **kwargs: Passed through to ``nn.MultiheadAttention``. The deprecated
            key ``dropout`` is intercepted and used for both ``attn_drop``
            and ``dropout_layer['drop_prob']``.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):
        super(MultiheadAttention, self).__init__(init_cfg)
        if 'dropout' in kwargs:
            warnings.warn('The arguments `dropout` in MultiheadAttention '
                          'has been deprecated, now you can separately '
                          'set `attn_drop`(float), proj_drop(float), '
                          'and `dropout_layer`(dict) ')
            attn_drop = kwargs.pop('dropout')
            # Write into a private copy: `dropout_layer` may be the shared
            # default dict (or a dict owned by the caller), and mutating it
            # would leak this drop probability into later instances.
            dropout_layer = copy.deepcopy(dropout_layer)
            dropout_layer['drop_prob'] = attn_drop

        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.batch_first = batch_first

        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                          **kwargs)

        self.proj_drop = nn.Dropout(proj_drop)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else nn.Identity()

    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiheadAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `MultiheadAttention`.

        **kwargs allow passing a more general data flow when combining
        with other operations in `transformerlayer`; they are not used here.

        Args:
            query (Tensor): The input query with shape [num_queries, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_queries embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims] .
                If None, the ``query`` will be used. Defaults to None.
            value (Tensor): The value tensor with same shape as `key`.
                Same in `nn.MultiheadAttention.forward`. Defaults to None.
                If None, the `key` will be used.
            identity (Tensor): This tensor, with the same shape as `query`,
                will be used for the identity link.
                If None, `query` (before positional encoding is added) will
                be used. Defaults to None.
            query_pos (Tensor): The positional encoding for query, with
                the same shape as `query`. If not None, it will
                be added to `query` before forward function. Defaults to
                None.
            key_pos (Tensor): The positional encoding for `key`, with the
                same shape as `key`. Defaults to None. If not None, it will
                be added to `key` before forward function. If None, and
                `query_pos` has the same shape as `key`, then `query_pos`
                will be used for `key_pos`. Defaults to None.
            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
                num_keys]. Same in `nn.MultiheadAttention.forward`.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
                Defaults to None.

        Returns:
            Tensor: forwarded results with shape
                [num_queries, bs, embed_dims]
                if self.batch_first is False, else
                [bs, num_queries embed_dims].
        """

        # Defaults are resolved before positional encodings are added, so
        # `value` and `identity` never contain the positional terms.
        if key is None:
            key = query
        if value is None:
            value = key
        if identity is None:
            identity = query
        if key_pos is None:
            if query_pos is not None:
                # use query_pos if key_pos is not available
                if query_pos.shape == key.shape:
                    key_pos = query_pos
                else:
                    warnings.warn('position encoding of key is '
                                  f'missing in {self.__class__.__name__}.')
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # Because the dataflow('key', 'query', 'value') of
        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
        # embed_dims), We should adjust the shape of dataflow from
        # batch_first (batch, num_query, embed_dims) to num_query_first
        # (num_query ,batch, embed_dims), and recover ``attn_output``
        # from num_query_first to batch_first.
        if self.batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        # nn.MultiheadAttention returns (output, weights); keep the output.
        out = self.attn(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]

        if self.batch_first:
            out = out.transpose(0, 1)

        return identity + self.dropout_layer(self.proj_drop(out))
@TRANSFORMER_LAYER.register_module()
class BaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformer.

    It can be built from `mmcv.ConfigDict` and support more flexible
    customization, for example, using any number of `FFN or LN ` and
    use different kinds of `attention` by specifying a list of `ConfigDict`
    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
    when you specifying `norm` as the first element of `operation_order`.

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for `self_attention` or `cross_attention` modules,
            The order of the configs in the list should be consistent with
            corresponding attentions in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config. Default: None.
        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for FFN, The order of the configs in the list should be
            consistent with corresponding ffn in operation_order.
            If it is a dict, all of the FFN modules in operation_order
            will be built with this config.
        operation_order (tuple[str]): The execution order of operation
            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
            Support `prenorm` when you specifying first element as `norm`.
            Default: None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Key, Query and Value are shape
            of (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
        **kwargs: Only the deprecated keys ``feedforward_channels``,
            ``ffn_dropout`` and ``ffn_num_fcs`` are read; they are copied
            into ``ffn_cfgs``.
    """

    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):

        # Work on a private copy: `ffn_cfgs` may be the shared default dict
        # (or owned by the caller) and is written to below.
        ffn_cfgs = copy.deepcopy(ffn_cfgs)

        deprecated_args = dict(
            feedforward_channels='feedforward_channels',
            ffn_dropout='ffn_drop',
            ffn_num_fcs='num_fcs')
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The arguments `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated, now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'to a dict named `ffn_cfgs`. ')
                ffn_cfgs[new_name] = kwargs[ori_name]

        super(BaseTransformerLayer, self).__init__(init_cfg)

        self.batch_first = batch_first

        # Every entry of operation_order must be a known operation type.
        supported_ops = ['self_attn', 'norm', 'ffn', 'cross_attn']
        assert set(operation_order) & set(supported_ops) == \
            set(operation_order), f'The operation_order of' \
            f' {self.__class__.__name__} should ' \
            f'only contain operation types from ' \
            f'{supported_ops}'

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length ' \
                f'of attn_cfgs {len(attn_cfgs)} is ' \
                f'not consistent with the number of attention ' \
                f'in operation_order {operation_order}.'
            # `batch_first` may be injected below; do not modify the
            # caller's config objects.
            attn_cfgs = [copy.deepcopy(cfg) for cfg in attn_cfgs]

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        # All sub-modules share the embedding size of the first attention.
        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                # `ffn_cfgs` is a list here, so the per-FFN config has to be
                # indexed before the key is filled in.
                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index],
                                          dict(type='FFN')))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])

    def forward(self,
                query,
                key=None,
                value=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `BaseTransformerLayer`.

        **kwargs contains some specific arguments of attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims] .
            value (Tensor): The value tensor with same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | Tensor | None): 2D Tensor used in
                calculation of corresponding attention. The length of
                it should equal to the number of `attention` in
                `operation_order`. A single Tensor is copied for every
                attention. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `key`, with
                shape [bs, num_keys]. Only used in `cross_attn` layer.
                Default: None.

        Returns:
            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
        """

        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                        f'attn_masks {len(attn_masks)} must be equal ' \
                        f'to the number of attention in ' \
                        f'operation_order {self.num_attn}'

        for layer in self.operation_order:
            if layer == 'self_attn':
                # Self-attention: key and value are the current query and
                # the query's positional encoding is reused for the key.
                temp_key = temp_value = query
                query = self.attentions[attn_index](
                    query,
                    temp_key,
                    temp_value,
                    # With pre-norm the residual is the un-normalised input.
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=query_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1

            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query
def forward(self,
            query,
            key,
            value,
            query_pos=None,
            key_pos=None,
            attn_masks=None,
            query_key_padding_mask=None,
            key_padding_mask=None,
            **kwargs):
    """Feed `query` through every layer of ``self.layers`` in order.

    The output of one layer becomes the `query` of the next; `key`,
    `value` and all keyword arguments are passed unchanged to each layer.

    Args:
        query (Tensor): Input query with shape
            `(num_queries, bs, embed_dims)`.
        key (Tensor): The key tensor with shape
            `(num_keys, bs, embed_dims)`.
        value (Tensor): The value tensor with shape
            `(num_keys, bs, embed_dims)`.
        query_pos (Tensor): The positional encoding for `query`.
            Default: None.
        key_pos (Tensor): The positional encoding for `key`.
            Default: None.
        attn_masks (List[Tensor], optional): Each element is 2D Tensor
            which is used in calculation of corresponding attention in
            operation_order. Default: None.
        query_key_padding_mask (Tensor): ByteTensor for `query`, with
            shape [bs, num_queries]. Only used in self-attention
            Default: None.
        key_padding_mask (Tensor): ByteTensor for `key`, with
            shape [bs, num_keys]. Default: None.

    Returns:
        Tensor:  results with shape [num_queries, bs, embed_dims].
    """
    # Everything except the running query is identical for each layer,
    # so assemble the keyword arguments once.
    layer_kwargs = dict(
        query_pos=query_pos,
        key_pos=key_pos,
        attn_masks=attn_masks,
        query_key_padding_mask=query_key_padding_mask,
        key_padding_mask=key_padding_mask,
        **kwargs)

    hidden = query
    for transformer_layer in self.layers:
        hidden = transformer_layer(hidden, key, value, **layer_kwargs)
    return hidden
def build_upsample_layer(cfg, *args, **kwargs):
    """Instantiate an upsample layer described by a config dict.

    Args:
        cfg (dict): The upsample layer config, which should contain:

            - type (str): Layer type, a key of ``UPSAMPLE_LAYERS``.
            - scale_factor (int): Upsample ratio, which is not applicable to
              deconv.
            - layer args: Args needed to instantiate a upsample layer.
        args (argument list): Positional arguments passed to the
            ``__init__`` method of the selected layer class.
        kwargs (keyword arguments): Keyword arguments passed to the
            ``__init__`` method of the selected layer class.

    Returns:
        nn.Module: Created upsample layer.

    Raises:
        TypeError: If `cfg` is not a dict.
        KeyError: If `cfg` has no ``"type"`` key or the type is not
            registered in ``UPSAMPLE_LAYERS``.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        raise KeyError(
            f'the cfg dict must contain the key "type", but got {cfg}')

    # Work on a copy so the caller's config keeps its "type" entry.
    layer_kwargs = cfg.copy()
    layer_type = layer_kwargs.pop('type')

    if layer_type not in UPSAMPLE_LAYERS:
        raise KeyError(f'Unrecognized upsample type {layer_type}')
    layer_cls = UPSAMPLE_LAYERS.get(layer_type)

    # ``nn.Upsample`` is registered under its interpolation mode names, so
    # the registry key doubles as the ``mode`` argument.
    if layer_cls is nn.Upsample:
        layer_kwargs['mode'] = layer_type

    return layer_cls(*args, **kwargs, **layer_kwargs)
class NewEmptyTensorOp(torch.autograd.Function):
    """Autograd function that swaps a tensor for an empty one of a new shape.

    The forward pass returns ``x.new_empty(new_shape)``; the backward pass
    returns a gradient allocated the same way with the shape of the
    original input, and no gradient for `new_shape`.
    """

    @staticmethod
    def forward(ctx, x, new_shape):
        # Remember the input shape so backward can allocate a gradient of
        # matching size.
        ctx.shape = x.shape
        result = x.new_empty(new_shape)
        return result

    @staticmethod
    def backward(ctx, grad):
        # One return value per forward input: a tensor shaped like `x`,
        # and None for the non-tensor `new_shape`.
        input_grad = NewEmptyTensorOp.apply(grad, ctx.shape)
        return input_grad, None
@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv')
@UPSAMPLE_LAYERS.register_module('deconv', force=True)
class ConvTranspose2d(nn.ConvTranspose2d):
    """``nn.ConvTranspose2d`` with a fallback for empty inputs.

    When the input has no elements and
    ``obsolete_torch_version(TORCH_VERSION, (1, 4))`` is true, the output
    shape is computed by hand and an empty tensor of that shape is
    returned. Every other input goes through ``nn.ConvTranspose2d.forward``.
    """

    def forward(self, x):
        use_fallback = x.numel() == 0 and obsolete_torch_version(
            TORCH_VERSION, (1, 4))
        if not use_fallback:
            return super().forward(x)

        # Transposed-convolution output size for each spatial dimension.
        spatial_dims = [
            (size - 1) * stride - 2 * pad + (dil * (kernel - 1) + 1) + out_pad
            for size, kernel, pad, stride, dil, out_pad in zip(
                x.shape[-2:], self.kernel_size, self.padding, self.stride,
                self.dilation, self.output_padding)
        ]
        out_shape = [x.shape[0], self.out_channels] + spatial_dims
        empty = NewEmptyTensorOp.apply(x, out_shape)

        if not self.training:
            return empty

        # produce dummy gradient to avoid DDP warning.
        dummy = sum(param.view(-1)[0] for param in self.parameters()) * 0.0
        return empty + dummy
class MaxPool2d(nn.MaxPool2d):
    """``nn.MaxPool2d`` with a fallback for empty inputs.

    When the input has no elements and
    ``obsolete_torch_version(TORCH_VERSION, (1, 9))`` is true, the pooled
    output shape is computed by hand and an empty tensor of that shape is
    returned. Every other input goes through ``nn.MaxPool2d.forward``.
    """

    def forward(self, x):
        # PyTorch 1.9 does not support empty tensor inference yet
        if x.numel() != 0 or not obsolete_torch_version(
                TORCH_VERSION, (1, 9)):
            return super().forward(x)

        # ``ceil_mode`` decides how a fractional output size is rounded.
        round_size = math.ceil if self.ceil_mode else math.floor

        out_shape = list(x.shape[:2])
        for size, kernel, pad, stride, dil in zip(
                x.shape[-2:], _pair(self.kernel_size), _pair(self.padding),
                _pair(self.stride), _pair(self.dilation)):
            raw = (size + 2 * pad - (dil * (kernel - 1) + 1)) / stride + 1
            out_shape.append(round_size(raw))

        return NewEmptyTensorOp.apply(x, out_shape)
def build_model_from_cfg(cfg, registry, default_args=None):
    """Build a PyTorch model from config dict(s).

    Unlike ``build_from_cfg``, a list of configs is accepted: every entry
    is built separately and the results are wrapped in a ``Sequential``.

    Args:
        cfg (dict, list[dict]): The config of modules; either a single
            config dict or a list of config dicts. If it is a list, the
            built modules will be wrapped with ``nn.Sequential``.
        registry (:obj:`Registry`): A registry the module belongs to.
        default_args (dict, optional): Default arguments to build the module.
            Defaults to None.

    Returns:
        nn.Module: A built nn module.
    """
    if not isinstance(cfg, list):
        return build_from_cfg(cfg, registry, default_args)

    return Sequential(
        *(build_from_cfg(item, registry, default_args) for item in cfg))
def conv3x3(in_planes, out_planes, stride=1, dilation=1):
    """Return a bias-free 3x3 ``nn.Conv2d``.

    The padding is set equal to `dilation`.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        stride (int): Convolution stride. Default: 1.
        dilation (int): Dilation rate, also used as padding. Default: 1.

    Returns:
        nn.Conv2d: The configured convolution layer.
    """
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
def make_res_layer(block,
                   inplanes,
                   planes,
                   blocks,
                   stride=1,
                   dilation=1,
                   style='pytorch',
                   with_cp=False):
    """Stack `blocks` residual blocks into one ``nn.Sequential`` stage.

    Only the first block receives `stride` and, when the stride is not 1
    or the channel count changes, a 1x1 conv + BatchNorm ``downsample``
    branch. The remaining blocks use stride 1 and take
    ``planes * block.expansion`` input channels.

    Args:
        block (type): Block class exposing an ``expansion`` attribute.
        inplanes (int): Input channels of the first block.
        planes (int): Base channel count passed to every block.
        blocks (int): Number of blocks in the stage.
        stride (int): Stride of the first block. Default: 1.
        dilation (int): Dilation passed to every block. Default: 1.
        style (str): Forwarded to every block. Default: 'pytorch'.
        with_cp (bool): Forwarded to every block. Default: False.

    Returns:
        nn.Sequential: The assembled stage.
    """
    out_planes = planes * block.expansion

    # A projection shortcut is needed whenever the first block changes
    # the stride or the channel count.
    if stride != 1 or inplanes != out_planes:
        downsample = nn.Sequential(
            nn.Conv2d(
                inplanes,
                out_planes,
                kernel_size=1,
                stride=stride,
                bias=False),
            nn.BatchNorm2d(out_planes),
        )
    else:
        downsample = None

    first_block = block(
        inplanes,
        planes,
        stride,
        dilation,
        downsample,
        style=style,
        with_cp=with_cp)
    following_blocks = [
        block(out_planes, planes, 1, dilation, style=style, with_cp=with_cp)
        for _ in range(1, blocks)
    ]

    return nn.Sequential(first_block, *following_blocks)
def forward(self, x):
    """Run the stem and residual stages, collecting selected stage outputs.

    Args:
        x: Input passed to ``self.conv1``.

    Returns:
        The output of the single selected stage when exactly one stage
        index is in ``self.out_indices``; otherwise a tuple holding the
        outputs of all selected stages, in stage order.
    """
    # Stem: conv -> bn -> relu -> maxpool.
    for stem_op in (self.conv1, self.bn1, self.relu, self.maxpool):
        x = stem_op(x)

    collected = []
    for stage_idx, stage_name in enumerate(self.res_layers):
        # Stages are registered as attributes named in ``res_layers``.
        x = getattr(self, stage_name)(x)
        if stage_idx in self.out_indices:
            collected.append(x)

    return collected[0] if len(collected) == 1 else tuple(collected)
getattr(self, f'layer{i}') + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py b/lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a263e31c1e3977712827ca229bbc04910b4e928e --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .flops_counter import get_model_complexity_info +from .fuse_conv_bn import fuse_conv_bn +from .sync_bn import revert_sync_batchnorm +from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, + KaimingInit, NormalInit, PretrainedInit, + TruncNormalInit, UniformInit, XavierInit, + bias_init_with_prob, caffe2_xavier_init, + constant_init, initialize, kaiming_init, normal_init, + trunc_normal_init, uniform_init, xavier_init) + +__all__ = [ + 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', + 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', + 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', + 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', + 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', + 'Caffe2XavierInit', 'revert_sync_batchnorm' +] diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/utils/flops_counter.py b/lavis/common/annotator/uniformer/mmcv/cnn/utils/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..d10af5feca7f4b8c0ba359b7b1c826f754e048be --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/utils/flops_counter.py @@ -0,0 +1,599 @@ +# Modified from flops-counter.pytorch by Vladislav Sovrasov +# original repo: https://github.com/sovrasov/flops-counter.pytorch + +# MIT License + +# Copyright (c) 2018 Vladislav Sovrasov + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and 
associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sys +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +import annotator.uniformer.mmcv as mmcv + + +def get_model_complexity_info(model, + input_shape, + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + flush=False, + ost=sys.stdout): + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, + ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. 
+ - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. + """ + assert type(input_shape) is tuple + assert len(input_shape) >= 1 + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. 
def flops_to_string(flops, units='GFLOPs', precision=2):
    """Format a FLOPs count as a human-readable string.

    Note that here one multiply-add is counted as one FLOP.

    Args:
        flops (float): FLOPs number to be converted.
        units (str | None): Target unit. Options are None, 'GFLOPs',
            'MFLOPs', 'KFLOPs', 'FLOPs'. With None the largest unit that
            yields at least one whole unit is chosen. Any other value is
            treated like 'FLOPs'. Default: 'GFLOPs'.
        precision (int): Digit number after the decimal point; not applied
            when the result is expressed in plain 'FLOPs'. Default: 2.

    Returns:
        str: The converted FLOPs number with units.

    Examples:
        >>> flops_to_string(1e9)
        '1.0 GFLOPs'
        >>> flops_to_string(2e5, 'MFLOPs')
        '0.2 MFLOPs'
        >>> flops_to_string(3e-9, None)
        '3e-09 FLOPs'
    """
    # (label, power of ten), largest unit first.
    scales = (('GFLOPs', 9), ('MFLOPs', 6), ('KFLOPs', 3))

    if units is None:
        # Automatic mode: first unit with at least one whole unit.
        match = next(
            (entry for entry in scales if flops // 10**entry[1] > 0), None)
    else:
        match = next((entry for entry in scales if units == entry[0]), None)

    if match is None:
        # Too small for any scaled unit, or an unrecognised unit name.
        return str(flops) + ' FLOPs'

    label, exponent = match
    return str(round(flops / 10.**exponent, precision)) + ' ' + label
def print_model_with_flops(model,
                           total_flops,
                           total_params,
                           units='GFLOPs',
                           precision=3,
                           ost=sys.stdout,
                           flush=False):
    """Print a model with FLOPs for each layer.

    Every module's ``extra_repr`` is temporarily replaced by one that
    prepends its parameter count, parameter share, FLOPs and FLOPs share.
    The model is printed and all temporary attributes are then removed
    again, even if printing fails.

    Args:
        model (nn.Module): The model to be printed.
        total_flops (float): Total FLOPs of the model.
        total_params (float): Total parameter counts of the model.
        units (str | None): Converted FLOPs units. Default: 'GFLOPs'.
        precision (int): Digit number after the decimal point. Default: 3.
        ost (stream): same as `file` param in :func:`print`.
            Default: sys.stdout.
        flush (bool): same as that in :func:`print`. Default: False.

    Example:
        >>> class ExampleModel(nn.Module):

        >>> def __init__(self):
        >>>     super().__init__()
        >>>     self.conv1 = nn.Conv2d(3, 8, 3)
        >>>     self.conv2 = nn.Conv2d(8, 256, 3)
        >>>     self.conv3 = nn.Conv2d(256, 8, 3)
        >>>     self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        >>>     self.flatten = nn.Flatten()
        >>>     self.fc = nn.Linear(8, 1)

        >>> def forward(self, x):
        >>>     x = self.conv1(x)
        >>>     x = self.conv2(x)
        >>>     x = self.conv3(x)
        >>>     x = self.avg_pool(x)
        >>>     x = self.flatten(x)
        >>>     x = self.fc(x)
        >>>     return x

        >>> model = ExampleModel()
        >>> x = (3, 16, 16)
        to print the complexity information state for each layer, you can use
        >>> get_model_complexity_info(model, x)
        or directly use
        >>> print_model_with_flops(model, 4579784.0, 37361)
        ExampleModel(
          0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,
          (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1))  # noqa: E501
          (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))
          (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))
          (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))
          (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )
          (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)
        )
    """

    def _share(part, whole):
        # A model without parameters (or without counted FLOPs) has a zero
        # total; report a 0% share instead of raising ZeroDivisionError.
        return part / whole if whole else 0.0

    def accumulate_params(self):
        # Supported leaf modules carry their own count; containers sum
        # the counts of their children.
        if is_supported_instance(self):
            return self.__params__
        return sum(child.accumulate_params() for child in self.children())

    def accumulate_flops(self):
        if is_supported_instance(self):
            # Average over the number of batches seen by the root model.
            return self.__flops__ / model.__batch_counter__
        return sum(child.accumulate_flops() for child in self.children())

    def flops_repr(self):
        accumulated_num_params = self.accumulate_params()
        accumulated_flops_cost = self.accumulate_flops()
        return ', '.join([
            params_to_string(
                accumulated_num_params, units='M', precision=precision),
            '{:.3%} Params'.format(
                _share(accumulated_num_params, total_params)),
            flops_to_string(
                accumulated_flops_cost, units=units, precision=precision),
            '{:.3%} FLOPs'.format(
                _share(accumulated_flops_cost, total_flops)),
            self.original_extra_repr()
        ])

    def add_extra_repr(m):
        # Bind the helpers to this module instance.
        m.accumulate_flops = accumulate_flops.__get__(m)
        m.accumulate_params = accumulate_params.__get__(m)
        flops_extra_repr = flops_repr.__get__(m)
        if m.extra_repr != flops_extra_repr:
            m.original_extra_repr = m.extra_repr
            m.extra_repr = flops_extra_repr
            assert m.extra_repr != m.original_extra_repr

    def del_extra_repr(m):
        if hasattr(m, 'original_extra_repr'):
            m.extra_repr = m.original_extra_repr
            del m.original_extra_repr
        # Remove both temporary helpers; leaving ``accumulate_params``
        # behind would keep a bound method on every module.
        if hasattr(m, 'accumulate_flops'):
            del m.accumulate_flops
        if hasattr(m, 'accumulate_params'):
            del m.accumulate_params

    model.apply(add_extra_repr)
    try:
        print(model, file=ost, flush=flush)
    finally:
        # Always restore the modules, even when printing fails.
        model.apply(del_extra_repr)
+ + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self): + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self): + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +# ---- Internal functions +def empty_flops_counter_hook(module, input, output): + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module, input, output): + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module, input, output): + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module, input, output): + input = input[0] + output_last_dim = output.shape[ + -1] # pytorch checks dimensions, so here we don't care much + module.__flops__ += int(np.prod(input.shape) * output_last_dim) + + +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) + + +def norm_flops_counter_hook(module, input, output): + input = input[0] + + batch_flops = np.prod(input.shape) + if (getattr(module, 'affine', False) + or getattr(module, 'elementwise_affine', False)): + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, 
output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_height + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = int( + np.prod(kernel_dims)) * in_channels * filters_per_channel + + active_elements_count = batch_size * int(np.prod(output_dims)) + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module, input, output): + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + batch_size = len(input) + else: + pass + print('Warning! 
No positional inputs found for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + print('Warning: variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module): + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping(): + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool2d: 
pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + mmcv.cnn.bricks.Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, + } diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/utils/fuse_conv_bn.py b/lavis/common/annotator/uniformer/mmcv/cnn/utils/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7076f80bf37f7931185bf0293ffcc1ce19c8ef --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/utils/fuse_conv_bn.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +def _fuse_conv_bn(conv, bn): + """Fuse conv and bn into one module. + + Args: + conv (nn.Module): Conv to be fused. + bn (nn.Module): BN to be fused. + + Returns: + nn.Module: Fused module. 
+ """ + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_conv_bn(module): + """Recursively fuse conv and bn in a module. + + During inference, the functionary of batch norm layers is turned off + but only the mean and var alone channels are used, which exposes the + chance to fuse it with the preceding conv layers to save computations and + simplify network structures. + + Args: + module (nn.Module): Module to be fused. + + Returns: + nn.Module: Fused module. + """ + last_conv = None + last_conv_name = None + + for name, child in module.named_children(): + if isinstance(child, + (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = _fuse_conv_bn(last_conv, child) + module._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. + module._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_conv_bn(child) + return module diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/utils/sync_bn.py b/lavis/common/annotator/uniformer/mmcv/cnn/utils/sync_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..f78f39181d75bb85c53e8c7c8eaf45690e9f0bee --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/utils/sync_bn.py @@ -0,0 +1,59 @@ +import torch + +import annotator.uniformer.mmcv as mmcv + + +class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): + """A general BatchNorm layer without input dimension check. 
+ + Reproduced from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc + is `_check_input_dim` that is designed for tensor sanity checks. + The check has been bypassed in this class for the convenience of converting + SyncBatchNorm. + """ + + def _check_input_dim(self, input): + return + + +def revert_sync_batchnorm(module): + """Helper function to convert all `SyncBatchNorm` (SyncBN) and + `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to + `BatchNormXd` layers. + + Adapted from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + + Args: + module (nn.Module): The module containing `SyncBatchNorm` layers. + + Returns: + module_output: The converted module with `BatchNormXd` layers. + """ + module_output = module + module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm] + if hasattr(mmcv, 'ops'): + module_checklist.append(mmcv.ops.SyncBatchNorm) + if isinstance(module, tuple(module_checklist)): + module_output = _BatchNormXd(module.num_features, module.eps, + module.momentum, module.affine, + module.track_running_stats) + if module.affine: + # no_grad() may not be needed here but + # just to be consistent with `convert_sync_batchnorm()` + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + module_output.training = module.training + # qconfig exists in quantized models + if hasattr(module, 'qconfig'): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module(name, revert_sync_batchnorm(child)) + del module + return module_output diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/utils/weight_init.py 
b/lavis/common/annotator/uniformer/mmcv/cnn/utils/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..287a1d0bffe26e023029d48634d9b761deda7ba4 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/utils/weight_init.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from annotator.uniformer.mmcv.utils import Registry, build_from_cfg, get_logger, print_log + +INITIALIZERS = Registry('initializer') + + +def update_init_info(module, init_info): + """Update the `_params_init_info` in the module if the value of parameters + are changed. + + Args: + module (obj:`nn.Module`): The module of PyTorch with a user-defined + attribute `_params_init_info` which records the initialization + information. + init_info (str): The string that describes the initialization. + """ + assert hasattr( + module, + '_params_init_info'), f'Can not find `_params_init_info` in {module}' + for name, param in module.named_parameters(): + + assert param in module._params_init_info, ( + f'Find a new :obj:`Parameter` ' + f'named `{name}` during executing the ' + f'`init_weights` of ' + f'`{module.__class__.__name__}`. ' + f'Please do not add or ' + f'replace parameters during executing ' + f'the `init_weights`. 
') + + # The parameter has been changed during executing the + # `init_weights` of module + mean_value = param.data.mean() + if module._params_init_info[param]['tmp_mean_value'] != mean_value: + module._params_init_info[param]['init_info'] = init_info + module._params_init_info[param]['tmp_mean_value'] = mean_value + + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def trunc_normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + bias: float = 0) -> None: + if hasattr(module, 'weight') and module.weight is not None: + trunc_normal_(module.weight, mean, std, a, b) # type: ignore + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) # type: ignore + + +def uniform_init(module, a=0, b=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution 
in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init( + module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +def _get_bases_name(m): + return [b.__name__ for b in m.__class__.__bases__] + + +class BaseInit(object): + + def __init__(self, *, bias=0, bias_prob=None, layer=None): + self.wholemodule = False + if not isinstance(bias, (int, float)): + raise TypeError(f'bias must be a number, but got a {type(bias)}') + + if bias_prob is not None: + if not isinstance(bias_prob, float): + raise TypeError(f'bias_prob type must be float, \ + but got {type(bias_prob)}') + + if layer is not None: + if not isinstance(layer, (str, list)): + raise TypeError(f'layer must be a str or a list of str, \ + but got a {type(layer)}') + else: + layer = [] + + if bias_prob is not None: + self.bias = bias_init_with_prob(bias_prob) + else: + self.bias = bias + self.layer = [layer] if isinstance(layer, str) else layer + + def _get_init_info(self): + info = f'{self.__class__.__name__}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Constant') +class ConstantInit(BaseInit): + """Initialize module parameters with constant values. 
+ + Args: + val (int | float): the value to fill the weights in the module with + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, val, **kwargs): + super().__init__(**kwargs) + self.val = val + + def __call__(self, module): + + def init(m): + if self.wholemodule: + constant_init(m, self.val, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + constant_init(m, self.val, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Xavier') +class XavierInit(BaseInit): + r"""Initialize module parameters with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks - Glorot, X. & Bengio, Y. (2010). + `_ + + Args: + gain (int | float): an optional scaling factor. Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` + or ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, gain=1, distribution='normal', **kwargs): + super().__init__(**kwargs) + self.gain = gain + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + xavier_init(m, self.gain, self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + xavier_init(m, self.gain, self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: gain={self.gain}, ' \ + f'distribution={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Normal') +class NormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + mean (int | float):the mean of the normal distribution. Defaults to 0. + std (int | float): the standard deviation of the normal distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ + """ + + def __init__(self, mean=0, std=1, **kwargs): + super().__init__(**kwargs) + self.mean = mean + self.std = std + + def __call__(self, module): + + def init(m): + if self.wholemodule: + normal_init(m, self.mean, self.std, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + normal_init(m, self.mean, self.std, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: mean={self.mean},' \ + f' std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='TruncNormal') +class TruncNormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values + outside :math:`[a, b]`. + + Args: + mean (float): the mean of the normal distribution. Defaults to 0. + std (float): the standard deviation of the normal distribution. + Defaults to 1. + a (float): The minimum cutoff value. + b ( float): The maximum cutoff value. + bias (float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ + """ + + def __init__(self, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + **kwargs) -> None: + super().__init__(**kwargs) + self.mean = mean + self.std = std + self.a = a + self.b = b + + def __call__(self, module: nn.Module) -> None: + + def init(m): + if self.wholemodule: + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ + f' mean={self.mean}, std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Uniform') +class UniformInit(BaseInit): + r"""Initialize module parameters with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + a (int | float): the lower bound of the uniform distribution. + Defaults to 0. + b (int | float): the upper bound of the uniform distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, a=0, b=1, **kwargs): + super().__init__(**kwargs) + self.a = a + self.b = b + + def __call__(self, module): + + def init(m): + if self.wholemodule: + uniform_init(m, self.a, self.b, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + uniform_init(m, self.a, self.b, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a},' \ + f' b={self.b}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Kaiming') +class KaimingInit(BaseInit): + r"""Initialize module parameters with the values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015). + `_ + + Args: + a (int | float): the negative slope of the rectifier used after this + layer (only used with ``'leaky_relu'``). Defaults to 0. + mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing + ``'fan_in'`` preserves the magnitude of the variance of the weights + in the forward pass. Choosing ``'fan_out'`` preserves the + magnitudes in the backwards pass. Defaults to ``'fan_out'``. + nonlinearity (str): the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` . + Defaults to 'relu'. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` or + ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, + a=0, + mode='fan_out', + nonlinearity='relu', + distribution='normal', + **kwargs): + super().__init__(**kwargs) + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ + f'nonlinearity={self.nonlinearity}, ' \ + f'distribution ={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Caffe2Xavier') +class Caffe2XavierInit(KaimingInit): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + def __init__(self, **kwargs): + super().__init__( + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform', + **kwargs) + + def __call__(self, module): + super().__call__(module) + + +@INITIALIZERS.register_module(name='Pretrained') +class PretrainedInit(object): + """Initialize module by loading a pretrained model. + + Args: + checkpoint (str): the checkpoint file of the pretrained model should + be load. + prefix (str, optional): the prefix of a sub-module in the pretrained + model. it is for loading a part of the pretrained model to + initialize. For example, if we would like to only load the + backbone of a detector model, we can set ``prefix='backbone.'``. + Defaults to None. + map_location (str): map tensors into proper locations. 
+ """ + + def __init__(self, checkpoint, prefix=None, map_location=None): + self.checkpoint = checkpoint + self.prefix = prefix + self.map_location = map_location + + def __call__(self, module): + from annotator.uniformer.mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, + load_state_dict) + logger = get_logger('mmcv') + if self.prefix is None: + print_log(f'load model from: {self.checkpoint}', logger=logger) + load_checkpoint( + module, + self.checkpoint, + map_location=self.map_location, + strict=False, + logger=logger) + else: + print_log( + f'load {self.prefix} in model from: {self.checkpoint}', + logger=logger) + state_dict = _load_checkpoint_with_prefix( + self.prefix, self.checkpoint, map_location=self.map_location) + load_state_dict(module, state_dict, strict=False, logger=logger) + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: load from {self.checkpoint}' + return info + + +def _initialize(module, cfg, wholemodule=False): + func = build_from_cfg(cfg, INITIALIZERS) + # wholemodule flag is for override mode, there is no layer key in override + # and initializer will give init values for the whole module with the name + # in override. 
+ func.wholemodule = wholemodule + func(module) + + +def _initialize_override(module, override, cfg): + if not isinstance(override, (dict, list)): + raise TypeError(f'override must be a dict or a list of dict, \ + but got {type(override)}') + + override = [override] if isinstance(override, dict) else override + + for override_ in override: + + cp_override = copy.deepcopy(override_) + name = cp_override.pop('name', None) + if name is None: + raise ValueError('`override` must contain the key "name",' + f'but got {cp_override}') + # if override only has name key, it means use args in init_cfg + if not cp_override: + cp_override.update(cfg) + # if override has name key and other args except type key, it will + # raise error + elif 'type' not in cp_override.keys(): + raise ValueError( + f'`override` need "type" key, but got {cp_override}') + + if hasattr(module, name): + _initialize(getattr(module, name), cp_override, wholemodule=True) + else: + raise RuntimeError(f'module did not have attribute {name}, ' + f'but init_cfg is {cp_override}.') + + +def initialize(module, init_cfg): + """Initialize a module. + + Args: + module (``torch.nn.Module``): the module will be initialized. + init_cfg (dict | list[dict]): initialization configuration dict to + define initializer. OpenMMLab has implemented 6 initializers + including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, + ``Kaiming``, and ``Pretrained``. 
+ Example: + >>> module = nn.Linear(2, 3, bias=True) + >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) + >>> initialize(module, init_cfg) + + >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + >>> # define key ``'layer'`` for initializing layer with different + >>> # configuration + >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Linear', val=2)] + >>> initialize(module, init_cfg) + + >>> # define key``'override'`` to initialize some specific part in + >>> # module + >>> class FooNet(nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.feat = nn.Conv2d(3, 16, 3) + >>> self.reg = nn.Conv2d(16, 10, 3) + >>> self.cls = nn.Conv2d(16, 5, 3) + >>> model = FooNet() + >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', + >>> override=dict(type='Constant', name='reg', val=3, bias=4)) + >>> initialize(model, init_cfg) + + >>> model = ResNet(depth=50) + >>> # Initialize weights with the pretrained model. + >>> init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + >>> initialize(model, init_cfg) + + >>> # Initialize weights of a sub-module with the specific part of + >>> # a pretrained model by using "prefix". 
+ >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + >>> 'retinanet_r50_fpn_1x_coco/'\ + >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + >>> init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + """ + if not isinstance(init_cfg, (dict, list)): + raise TypeError(f'init_cfg must be a dict or a list of dict, \ + but got {type(init_cfg)}') + + if isinstance(init_cfg, dict): + init_cfg = [init_cfg] + + for cfg in init_cfg: + # should deeply copy the original config because cfg may be used by + # other modules, e.g., one init_cfg shared by multiple bottleneck + # blocks, the expected cfg will be changed after pop and will change + # the initialization behavior of other modules + cp_cfg = copy.deepcopy(cfg) + override = cp_cfg.pop('override', None) + _initialize(module, cp_cfg) + + if override is not None: + cp_cfg.pop('layer', None) + _initialize_override(module, override, cp_cfg) + else: + # All attributes in module have same initialization. + pass + + +def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, + b: float) -> Tensor: + # Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Modified from + # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. 
+ # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [lower, upper], then translate + # to [2lower-1, 2upper-1]. + tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Modified from + https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + + Args: + tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. + mean (float): the mean of the normal distribution. + std (float): the standard deviation of the normal distribution. + a (float): the minimum cutoff value. + b (float): the maximum cutoff value. + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/lavis/common/annotator/uniformer/mmcv/cnn/vgg.py b/lavis/common/annotator/uniformer/mmcv/cnn/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..8778b649561a45a9652b1a15a26c2d171e58f3e1 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/cnn/vgg.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def conv3x3(in_planes, out_planes, dilation=1):
    """3x3 convolution with padding.

    ``padding`` is set equal to ``dilation``.
    """
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        padding=dilation,
        dilation=dilation)


def make_vgg_layer(inplanes,
                   planes,
                   num_blocks,
                   dilation=1,
                   with_bn=False,
                   ceil_mode=False):
    """Build the layer list of one VGG stage.

    Each of the ``num_blocks`` blocks is conv3x3 -> (BatchNorm2d) -> ReLU;
    a 2x2 stride-2 max-pool is appended after the last block.

    Args:
        inplanes (int): Input channels of the first convolution.
        planes (int): Output channels of every convolution in the stage.
        num_blocks (int): Number of conv blocks.
        dilation (int): Dilation (and padding) of the convolutions.
        with_bn (bool): Insert ``nn.BatchNorm2d`` after each convolution.
        ceil_mode (bool): ``ceil_mode`` of the max-pool.

    Returns:
        list[nn.Module]: Layers in forward order.
    """
    layers = []
    for _ in range(num_blocks):
        layers.append(conv3x3(inplanes, planes, dilation))
        if with_bn:
            layers.append(nn.BatchNorm2d(planes))
        layers.append(nn.ReLU(inplace=True))
        inplanes = planes
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))

    return layers


class VGG(nn.Module):
    """VGG backbone.

    Args:
        depth (int): Depth of vgg, from {11, 13, 16, 19}.
        with_bn (bool): Use BatchNorm or not.
        num_classes (int): number of classes for classification.
        num_stages (int): VGG stages, normally 5.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
        ceil_mode (bool): ``ceil_mode`` passed to every max-pool layer.
        with_last_pool (bool): If False, the last layer of the final stage
            (its max-pool) is removed.
    """

    # depth -> number of conv blocks in each of the five stages
    arch_settings = {
        11: (1, 1, 2, 2, 2),
        13: (2, 2, 2, 2, 2),
        16: (2, 2, 3, 3, 3),
        19: (2, 2, 4, 4, 4)
    }

    def __init__(self,
                 depth,
                 with_bn=False,
                 num_classes=-1,
                 num_stages=5,
                 dilations=(1, 1, 1, 1, 1),
                 out_indices=(0, 1, 2, 3, 4),
                 frozen_stages=-1,
                 bn_eval=True,
                 bn_frozen=False,
                 ceil_mode=False,
                 with_last_pool=True):
        super(VGG, self).__init__()
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for vgg')
        assert num_stages >= 1 and num_stages <= 5
        stage_blocks = self.arch_settings[depth]
        self.stage_blocks = stage_blocks[:num_stages]
        assert len(dilations) == num_stages
        assert max(out_indices) <= num_stages

        self.num_classes = num_classes
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen

        self.inplanes = 3
        start_idx = 0
        vgg_layers = []
        # [start, end) index range of each stage inside the flat layer list
        self.range_sub_modules = []
        for i, num_blocks in enumerate(self.stage_blocks):
            # conv + relu (+ bn) per block, plus one pooling layer per stage
            num_modules = num_blocks * (2 + with_bn) + 1
            end_idx = start_idx + num_modules
            dilation = dilations[i]
            planes = 64 * 2**i if i < 4 else 512
            vgg_layer = make_vgg_layer(
                self.inplanes,
                planes,
                num_blocks,
                dilation=dilation,
                with_bn=with_bn,
                ceil_mode=ceil_mode)
            vgg_layers.extend(vgg_layer)
            self.inplanes = planes
            self.range_sub_modules.append([start_idx, end_idx])
            start_idx = end_idx
        if not with_last_pool:
            vgg_layers.pop(-1)
            self.range_sub_modules[-1][1] -= 1
        self.module_name = 'features'
        self.add_module(self.module_name, nn.Sequential(*vgg_layers))

        if self.num_classes > 0:
            self.classifier = nn.Sequential(
                nn.Linear(512 * 7 * 7, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, num_classes),
            )

    def init_weights(self, pretrained=None):
        """Load a checkpoint or apply the default initialization.

        Args:
            pretrained (str | None): Checkpoint passed to
                ``load_checkpoint`` (non-strict). When None, Conv2d layers
                get ``kaiming_init``, BatchNorm2d ``constant_init(m, 1)``
                and Linear ``normal_init(m, std=0.01)``.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            from ..runner import load_checkpoint
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
                elif isinstance(m, nn.Linear):
                    normal_init(m, std=0.01)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):
        """Run the stages and collect the requested outputs.

        Returns:
            The output of every stage listed in ``out_indices`` (plus the
            classifier output when ``num_classes > 0``): a single value when
            exactly one output was collected, otherwise a tuple.
        """
        outs = []
        vgg_layers = getattr(self, self.module_name)
        for i in range(len(self.stage_blocks)):
            for j in range(*self.range_sub_modules[i]):
                vgg_layer = vgg_layers[j]
                x = vgg_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if self.num_classes > 0:
            x = x.view(x.size(0), -1)
            x = self.classifier(x)
            outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode=True):
        """Set train/eval mode while honouring the BN and freeze options.

        Args:
            mode (bool): Forwarded to ``nn.Module.train``.

        Returns:
            VGG: ``self``, matching the ``nn.Module.train`` contract so that
            calls can be chained.
        """
        super(VGG, self).train(mode)
        if self.bn_eval:
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if self.bn_frozen:
                        for params in m.parameters():
                            params.requires_grad = False
        vgg_layers = getattr(self, self.module_name)
        if mode and self.frozen_stages >= 0:
            for i in range(self.frozen_stages):
                for j in range(*self.range_sub_modules[i]):
                    mod = vgg_layers[j]
                    mod.eval()
                    for param in mod.parameters():
                        param.requires_grad = False
        # nn.Module.train returns the module itself; keep that contract.
        return self
def single_gpu_test(model, data_loader):
    """Evaluate ``model`` on one GPU while showing a progress bar.

    Args:
        model (nn.Module): Model to be tested; switched to eval mode.
        data_loader (nn.Dataloader): Pytorch data loader.

    Returns:
        list: The prediction results of all batches, concatenated.
    """
    model.eval()
    outputs = []
    progress = mmcv.ProgressBar(len(data_loader.dataset))
    for batch in data_loader:
        with torch.no_grad():
            batch_out = model(return_loss=False, **batch)
        outputs.extend(batch_out)

        # The result is assumed to have one entry per sample, see
        # https://github.com/open-mmlab/mmcv/issues/985
        for _ in range(len(batch_out)):
            progress.update()
    return outputs


def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Evaluate ``model`` on several GPUs and gather the results.

    Results are gathered either through GPU communication
    (``gpu_collect=True``, see :func:`collect_results_gpu`) or through files
    written to ``tmpdir`` (see :func:`collect_results_cpu`).

    Args:
        model (nn.Module): Model to be tested; switched to eval mode.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Directory for the temporary per-rank results in cpu
            mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect
            results.

    Returns:
        list: The prediction results.
    """
    model.eval()
    outputs = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    is_main = rank == 0
    if is_main:
        progress = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    for batch in data_loader:
        with torch.no_grad():
            batch_out = model(return_loss=False, **batch)
        outputs.extend(batch_out)

        if not is_main:
            continue
        # Rank 0 advances the bar for all ranks, capped at what is left.
        remaining = len(dataset) - progress.completed
        ticks = min(len(batch_out) * world_size, remaining)
        for _ in range(ticks):
            progress.update()

    # collect results from all ranks
    if gpu_collect:
        return collect_results_gpu(outputs, len(dataset))
    return collect_results_cpu(outputs, len(dataset), tmpdir)


def collect_results_cpu(result_part, size, tmpdir=None):
    """Gather per-rank results through pickle files in a shared directory.

    Every rank dumps its part into ``tmpdir``; after a barrier rank 0 loads
    all parts, interleaves them and removes the directory.

    Args:
        result_part (list): Results produced by the calling rank.
        size (int): Number of results to keep after merging.
        tmpdir (str | None): Directory for the part files. When None, rank 0
            creates one under ``.dist_test`` and broadcasts its name.

    Returns:
        list: The merged results on rank 0, ``None`` on every other rank.
    """
    rank, world_size = get_dist_info()
    if tmpdir is not None:
        mmcv.mkdir_or_exist(tmpdir)
    else:
        # Broadcast the directory name as a fixed-size, space-padded
        # (32 == ' ') byte tensor.
        MAX_LEN = 512
        name_buf = torch.full((MAX_LEN, ),
                              32,
                              dtype=torch.uint8,
                              device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            tmpdir = tempfile.mkdtemp(dir='.dist_test')
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            name_buf[:len(tmpdir)] = tmpdir
        dist.broadcast(name_buf, 0)
        tmpdir = name_buf.cpu().numpy().tobytes().decode().rstrip()

    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()

    if rank != 0:
        return None

    # load results of all parts from tmp dir
    parts = []
    for idx in range(world_size):
        loaded = mmcv.load(osp.join(tmpdir, f'part_{idx}.pkl'))
        # When data is severely insufficient, an empty part on a certain gpu
        # could make the overall outputs empty, so empty parts are dropped.
        if loaded:
            parts.append(loaded)
    # interleave the parts to restore the sample order
    merged = [item for group in zip(*parts) for item in group]
    # the dataloader may pad some samples
    merged = merged[:size]
    # remove tmp dir
    shutil.rmtree(tmpdir)
    return merged


def collect_results_gpu(result_part, size):
    """Gather per-rank results with ``all_gather`` on pickled byte tensors.

    Args:
        result_part (list): Results produced by the calling rank.
        size (int): Number of results to keep after merging.

    Returns:
        list: The merged results on rank 0, ``None`` on every other rank.
    """
    rank, world_size = get_dist_info()
    # dump result part to tensor with pickle
    payload = torch.tensor(
        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
    # gather all result part tensor shape
    my_shape = torch.tensor(payload.shape, device='cuda')
    all_shapes = [my_shape.clone() for _ in range(world_size)]
    dist.all_gather(all_shapes, my_shape)
    # padding result part tensor to max length
    max_len = torch.tensor(all_shapes).max()
    send_buf = torch.zeros(max_len, dtype=torch.uint8, device='cuda')
    send_buf[:my_shape[0]] = payload
    recv_bufs = [payload.new_zeros(max_len) for _ in range(world_size)]
    # gather all result part
    dist.all_gather(recv_bufs, send_buf)

    if rank != 0:
        return None

    parts = []
    for buf, shape in zip(recv_bufs, all_shapes):
        # NOTE: the bytes unpickled here are what the other ranks sent
        # through ``all_gather`` above.
        loaded = pickle.loads(buf[:shape[0]].cpu().numpy().tobytes())
        # When data is severely insufficient, an empty part on a certain gpu
        # could make the overall outputs empty, so empty parts are dropped.
        if loaded:
            parts.append(loaded)
    # interleave the parts to restore the sample order
    merged = [item for group in zip(*parts) for item in group]
    # the dataloader may pad some samples
    return merged[:size]
+from .file_client import BaseStorageBackend, FileClient +from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler +from .io import dump, load, register_handler +from .parse import dict_from_file, list_from_file + +__all__ = [ + 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', + 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', + 'list_from_file', 'dict_from_file' +] diff --git a/lavis/common/annotator/uniformer/mmcv/fileio/file_client.py b/lavis/common/annotator/uniformer/mmcv/fileio/file_client.py new file mode 100644 index 0000000000000000000000000000000000000000..950f0c1aeab14b8e308a7455ccd64a95b5d98add --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/fileio/file_client.py @@ -0,0 +1,1148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import os +import os.path as osp +import re +import tempfile +import warnings +from abc import ABCMeta, abstractmethod +from contextlib import contextmanager +from pathlib import Path +from typing import Iterable, Iterator, Optional, Tuple, Union +from urllib.request import urlopen + +import annotator.uniformer.mmcv as mmcv +from annotator.uniformer.mmcv.utils.misc import has_method +from annotator.uniformer.mmcv.utils.path import is_filepath + + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. + + All backends need to implement two apis: ``get()`` and ``get_text()``. + ``get()`` reads the file as a byte stream and ``get_text()`` reads the file + as texts. + """ + + # a flag to indicate whether the backend can create a symlink for a file + _allow_symlink = False + + @property + def name(self): + return self.__class__.__name__ + + @property + def allow_symlink(self): + return self._allow_symlink + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + + +class CephBackend(BaseStorageBackend): + """Ceph storage backend (for internal use). 
+ + Args: + path_mapping (dict|None): path mapping dict from local path to Petrel + path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` + will be replaced by ``dst``. Default: None. + + .. warning:: + :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, + please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. + """ + + def __init__(self, path_mapping=None): + try: + import ceph + except ImportError: + raise ImportError('Please install ceph to enable CephBackend.') + + warnings.warn( + 'CephBackend will be deprecated, please use PetrelBackend instead') + self._client = ceph.S3Client() + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def get(self, filepath): + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class PetrelBackend(BaseStorageBackend): + """Petrel storage backend (for internal use). + + PetrelBackend supports reading and writing data to multiple clusters. + If the file path contains the cluster name, PetrelBackend will read data + from specified cluster or write data to it. Otherwise, PetrelBackend will + access the default cluster. + + Args: + path_mapping (dict, optional): Path mapping dict from local path to + Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in + ``filepath`` will be replaced by ``dst``. Default: None. + enable_mc (bool, optional): Whether to enable memcached support. + Default: True. 
+ + Examples: + >>> filepath1 = 's3://path/of/file' + >>> filepath2 = 'cluster-name:s3://path/of/file' + >>> client = PetrelBackend() + >>> client.get(filepath1) # get data from default cluster + >>> client.get(filepath2) # get data from 'cluster-name' cluster + """ + + def __init__(self, + path_mapping: Optional[dict] = None, + enable_mc: bool = True): + try: + from petrel_client import client + except ImportError: + raise ImportError('Please install petrel_client to enable ' + 'PetrelBackend.') + + self._client = client.Client(enable_mc=enable_mc) + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def _map_path(self, filepath: Union[str, Path]) -> str: + """Map ``filepath`` to a string path whose prefix will be replaced by + :attr:`self.path_mapping`. + + Args: + filepath (str): Path to be mapped. + """ + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + return filepath + + def _format_path(self, filepath: str) -> str: + """Convert a ``filepath`` to standard format of petrel oss. + + If the ``filepath`` is concatenated by ``os.path.join``, in a Windows + environment, the ``filepath`` will be the format of + 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the + above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. + + Args: + filepath (str): Path to be formatted. + """ + return re.sub(r'\\+', '/', filepath) + + def get(self, filepath: Union[str, Path]) -> memoryview: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + memoryview: A memory view of expected bytes object to avoid + copying. The memoryview object can be converted to bytes by + ``value_buf.tobytes()``. 
+ """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return str(self.get(filepath), encoding=encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Save data to a given ``filepath``. + + Args: + obj (bytes): Data to be saved. + filepath (str or Path): Path to write data. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.put(filepath, obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Save data to a given ``filepath``. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to encode the ``obj``. + Default: 'utf-8'. + """ + self.put(bytes(obj, encoding=encoding), filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + if not has_method(self._client, 'delete'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.delete(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. 
+ """ + if not (has_method(self._client, 'contains') + and has_method(self._client, 'isdir')): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) or self._client.isdir(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + if not has_method(self._client, 'isdir'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + if not has_method(self._client, 'contains'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result after concatenation. 
+ """ + filepath = self._format_path(self._map_path(filepath)) + if filepath.endswith('/'): + filepath = filepath[:-1] + formatted_paths = [filepath] + for path in filepaths: + formatted_paths.append(self._format_path(self._map_path(path))) + return '/'.join(formatted_paths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download a file from ``filepath`` and return a temporary path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str | Path): Download a file from ``filepath``. + + Examples: + >>> client = PetrelBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('s3://path/of/your/file') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one temporary path. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + assert self.isfile(filepath) + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + Petrel has no concept of directories but it simulates the directory + hierarchy in the filesystem through public prefixes. In addition, + if the returned path ends with '/', it means the path is a public + prefix which is a logical directory. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. 
+ In addition, the returned path of directory will not contains the + suffix '/' which is consistent with other backends. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + if not has_method(self._client, 'list'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.')) + + dir_path = self._map_path(dir_path) + dir_path = self._format_path(dir_path) + if list_dir and suffix is not None: + raise TypeError( + '`list_dir` should be False when `suffix` is not None') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + # Petrel's simulated directory hierarchy assumes that directory paths + # should end with `/` + if not dir_path.endswith('/'): + dir_path += '/' + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for path in self._client.list(dir_path): + # the `self.isdir` is not used here to determine whether path + # is a directory, because `self.isdir` relies on + # `self._client.list` + if path.endswith('/'): # a directory path + next_dir_path = self.join_path(dir_path, path) + if list_dir: + # get the relative path and exclude the last + # character '/' + rel_dir = next_dir_path[len(root):-1] + yield rel_dir + if recursive: + yield from _list_dir_or_file(next_dir_path, list_dir, + list_file, suffix, + recursive) + else: # a file path + absolute_path = self.join_path(dir_path, path) + rel_path = absolute_path[len(root):] + if (suffix is None + or 
rel_path.endswith(suffix)) and list_file: + yield rel_path + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class MemcachedBackend(BaseStorageBackend): + """Memcached storage backend. + + Attributes: + server_list_cfg (str): Config file for memcached server list. + client_cfg (str): Config file for memcached client. + sys_path (str | None): Additional path to be appended to `sys.path`. + Default: None. + """ + + def __init__(self, server_list_cfg, client_cfg, sys_path=None): + if sys_path is not None: + import sys + sys.path.append(sys_path) + try: + import mc + except ImportError: + raise ImportError( + 'Please install memcached to enable MemcachedBackend.') + + self.server_list_cfg = server_list_cfg + self.client_cfg = client_cfg + self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, + self.client_cfg) + # mc.pyvector servers as a point which points to a memory cache + self._mc_buffer = mc.pyvector() + + def get(self, filepath): + filepath = str(filepath) + import mc + self._client.Get(filepath, self._mc_buffer) + value_buf = mc.ConvertBuffer(self._mc_buffer) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class LmdbBackend(BaseStorageBackend): + """Lmdb storage backend. + + Args: + db_path (str): Lmdb database path. + readonly (bool, optional): Lmdb environment parameter. If True, + disallow any write operations. Default: True. + lock (bool, optional): Lmdb environment parameter. If False, when + concurrent access occurs, do not lock the database. Default: False. + readahead (bool, optional): Lmdb environment parameter. If False, + disable the OS filesystem readahead mechanism, which may improve + random read performance when a database is larger than RAM. + Default: False. + + Attributes: + db_path (str): Lmdb database path. 
def get(self, filepath):
    """Look up the value stored under ``filepath`` in the lmdb environment.

    Args:
        filepath (str | obj:`Path`): Here, filepath is the lmdb key. It is
            converted with ``str`` and encoded as ASCII before the lookup.

    Returns:
        Whatever the read-only transaction's ``get`` returns for the key.
    """
    lmdb_key = str(filepath)
    # A read-only transaction is enough: nothing is written here.
    with self._client.begin(write=False) as read_txn:
        return read_txn.get(lmdb_key.encode('ascii'))
+ """ + mmcv.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'wb') as f: + f.write(obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` will create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + """ + mmcv.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'w', encoding=encoding) as f: + f.write(obj) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + os.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return osp.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return osp.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return osp.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. 
+ + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return osp.join(filepath, *filepaths) + + @contextmanager + def get_local_path( + self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: + """Only for unified API and do nothing.""" + yield filepath + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. 
class HTTPBackend(BaseStorageBackend):
    """HTTP and HTTPS storage backend."""

    def get(self, filepath):
        """Fetch ``filepath`` with ``urlopen`` and return the raw body.

        Args:
            filepath (str): URL to read.

        Returns:
            The value returned by ``read()`` on the opened URL.
        """
        value_buf = urlopen(filepath).read()
        return value_buf

    def get_text(self, filepath, encoding='utf-8'):
        """Fetch ``filepath`` with ``urlopen`` and decode the body.

        Args:
            filepath (str): URL to read.
            encoding (str): Encoding used to decode the body.
                Default: 'utf-8'.

        Returns:
            str: The decoded body.
        """
        value_buf = urlopen(filepath).read()
        return value_buf.decode(encoding)

    @contextmanager
    def get_local_path(self, filepath: str) -> Iterable[str]:
        """Download a file from ``filepath``.

        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
        It can be called with ``with`` statement, and when exiting from the
        ``with`` statement, the temporary path will be released.

        Args:
            filepath (str): Download a file from ``filepath``.

        Yields:
            str: Path of a temporary local file holding the downloaded data.

        Examples:
            >>> client = HTTPBackend()
            >>> # After exiting from the ``with`` clause,
            >>> # the path will be removed
            >>> with client.get_local_path('http://path/of/your/file') as path:
            ...     # do something here
        """
        # Download before creating the temporary file: a failed request then
        # leaves nothing behind that would need cleaning up.
        content = self.get(filepath)
        # Create the file outside the ``try`` so that ``finally`` never
        # refers to an unbound name when the creation itself fails.
        f = tempfile.NamedTemporaryFile(delete=False)
        try:
            # ``with f`` closes the handle even if the write fails, so the
            # file is never removed while it is still open.
            with f:
                f.write(content)
            yield f.name
        finally:
            os.remove(f.name)
+ + The client loads a file or text in a specified backend from its path + and returns it as a binary or text file. There are two ways to choose a + backend, the name of backend and the prefix of path. Although both of them + can be used to choose a storage backend, ``backend`` has a higher priority + that is if they are all set, the storage backend will be chosen by the + backend argument. If they are all `None`, the disk backend will be chosen. + Note that It can also register other backend accessor with a given name, + prefixes, and backend class. In addition, We use the singleton pattern to + avoid repeated object creation. If the arguments are the same, the same + object will be returned. + + Args: + backend (str, optional): The storage backend type. Options are "disk", + "ceph", "memcached", "lmdb", "http" and "petrel". Default: None. + prefix (str, optional): The prefix of the registered storage backend. + Options are "s3", "http", "https". Default: None. + + Examples: + >>> # only set backend + >>> file_client = FileClient(backend='petrel') + >>> # only set prefix + >>> file_client = FileClient(prefix='s3') + >>> # set both backend and prefix but use backend to choose client + >>> file_client = FileClient(backend='petrel', prefix='s3') + >>> # if the arguments are the same, the same object is returned + >>> file_client1 = FileClient(backend='petrel') + >>> file_client1 is file_client + True + + Attributes: + client (:obj:`BaseStorageBackend`): The backend object. 
+ """ + + _backends = { + 'disk': HardDiskBackend, + 'ceph': CephBackend, + 'memcached': MemcachedBackend, + 'lmdb': LmdbBackend, + 'petrel': PetrelBackend, + 'http': HTTPBackend, + } + # This collection is used to record the overridden backends, and when a + # backend appears in the collection, the singleton pattern is disabled for + # that backend, because if the singleton pattern is used, then the object + # returned will be the backend before overwriting + _overridden_backends = set() + _prefix_to_backends = { + 's3': PetrelBackend, + 'http': HTTPBackend, + 'https': HTTPBackend, + } + _overridden_prefixes = set() + + _instances = {} + + def __new__(cls, backend=None, prefix=None, **kwargs): + if backend is None and prefix is None: + backend = 'disk' + if backend is not None and backend not in cls._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(cls._backends.keys())}') + if prefix is not None and prefix not in cls._prefix_to_backends: + raise ValueError( + f'prefix {prefix} is not supported. 
@staticmethod
def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]:
    """Parse the prefix of a uri.

    Only the text before the first ``'://'`` is treated as the prefix, so a
    uri that contains another ``'://'`` later on (for example inside a query
    string) is still parsed.

    Args:
        uri (str | Path): Uri to be parsed that contains the file prefix.

    Examples:
        >>> FileClient.parse_uri_prefix('s3://path/of/your/file')
        's3'
        >>> FileClient.parse_uri_prefix('clusterName:s3://path/of/your/file')
        's3'
        >>> FileClient.parse_uri_prefix('/path/of/your/file') is None
        True

    Returns:
        str | None: Return the prefix of uri if the uri contains '://'
        else ``None``.
    """
    assert is_filepath(uri)
    uri = str(uri)
    # ``partition`` splits on the first '://' only; the previous two-value
    # unpacking of ``split('://')`` raised ValueError on a second '://'.
    prefix, separator, _ = uri.partition('://')
    if not separator:
        return None
    # In the case of PetrelBackend, the prefix may contain the cluster
    # name like clusterName:s3, so keep only the part after the last ':'.
    return prefix.rsplit(':', 1)[-1]
@classmethod
def _register_backend(cls, name, backend, force=False, prefixes=None):
    """Validate ``backend`` and record it under ``name`` and ``prefixes``.

    Args:
        name (str): The name of the registered backend.
        backend (class): The backend class, which must be a subclass of
            ``BaseStorageBackend``.
        force (bool): Whether to override an already registered name or
            prefix. Default: False.
        prefixes (str or list[str] or tuple[str], optional): The prefixes
            that should map to ``backend``. Default: None.

    Raises:
        TypeError: If ``name`` is not a string, or ``backend`` is not a
            class derived from ``BaseStorageBackend``.
        KeyError: If ``name`` or one of ``prefixes`` is already registered
            and ``force`` is False.
    """
    if not isinstance(name, str):
        raise TypeError('the backend name should be a string, '
                        f'but got {type(name)}')
    if not inspect.isclass(backend):
        raise TypeError(
            f'backend should be a class but got {type(backend)}')
    if not issubclass(backend, BaseStorageBackend):
        raise TypeError(
            f'backend {backend} is not a subclass of BaseStorageBackend')

    name_taken = name in cls._backends
    if name_taken and not force:
        raise KeyError(
            f'{name} is already registered as a storage backend, '
            'add "force=True" if you want to override it')
    if name_taken:
        # Remember the override so that cached instances are not reused.
        cls._overridden_backends.add(name)
    cls._backends[name] = backend

    if prefixes is None:
        return

    if isinstance(prefixes, str):
        prefixes = [prefixes]
    else:
        assert isinstance(prefixes, (list, tuple))

    for prefix in prefixes:
        if prefix in cls._prefix_to_backends:
            if not force:
                raise KeyError(
                    f'{prefix} is already registered as a storage backend,'
                    ' add "force=True" if you want to override it')
            cls._overridden_prefixes.add(prefix)
        cls._prefix_to_backends[prefix] = backend
+ + This method can be used as a normal class method or a decorator. + + .. code-block:: python + + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + FileClient.register_backend('new', NewBackend) + + or + + .. code-block:: python + + @FileClient.register_backend('new') + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + Args: + name (str): The name of the registered backend. + backend (class, optional): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + When this method is used as a decorator, backend is None. + Defaults to None. + force (bool, optional): Whether to override the backend if the name + has already been registered. Defaults to False. + prefixes (str or list[str] or tuple[str], optional): The prefixes + of the registered storage backend. Default: None. + `New in version 1.3.15.` + """ + if backend is not None: + cls._register_backend( + name, backend, force=force, prefixes=prefixes) + return + + def _register(backend_cls): + cls._register_backend( + name, backend_cls, force=force, prefixes=prefixes) + return backend_cls + + return _register + + def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: + """Read data from a given ``filepath`` with 'rb' mode. + + Note: + There are two types of return values for ``get``, one is ``bytes`` + and the other is ``memoryview``. The advantage of using memoryview + is that you can avoid copying, and if you want to convert it to + ``bytes``, you can use ``.tobytes()``. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes | memoryview: Expected bytes object or a memory view of the + bytes object. 
def put_text(self,
             obj: str,
             filepath: Union[str, Path],
             encoding: Optional[str] = None) -> None:
    """Write data to a given ``filepath`` with 'w' mode.

    Note:
        ``put_text`` should create a directory if the directory of
        ``filepath`` does not exist.

    Args:
        obj (str): Data to be written.
        filepath (str or Path): Path to write data.
        encoding (str, optional): The encoding format used to open the
            `filepath`. When left as ``None`` the argument is not forwarded
            and the backend's own default applies (the disk backend
            defaults to 'utf-8'). Default: None.
    """
    if encoding is None:
        # Keep the original two-argument call so that backends whose
        # ``put_text`` takes no encoding keep working.
        self.client.put_text(obj, filepath)
    else:
        self.client.put_text(obj, filepath, encoding)
+ """ + return self.client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return self.client.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return self.client.join_path(filepath, *filepaths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download data from ``filepath`` and write the data to local path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Note: + If the ``filepath`` is a local path, just return itself. + + .. warning:: + ``get_local_path`` is an experimental interface that may change in + the future. + + Args: + filepath (str or Path): Path to be read data. + + Examples: + >>> file_client = FileClient(prefix='s3') + >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one path. + """ + with self.client.get_local_path(str(filepath)) as local_path: + yield local_path + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. 
class BaseFileHandler(metaclass=ABCMeta):
    """Abstract interface for (de)serializing objects in one file format.

    Subclasses implement the three file-object/string primitives; the
    path-based helpers below are built on top of them.
    """

    # `str_like` tells callers whether this handler works on str-like or
    # bytes-like file objects. Pickle only processes bytes-like objects but
    # json only processes str-like objects. If it is str-like, `StringIO`
    # will be used to process the buffer.
    str_like = True

    @abstractmethod
    def load_from_fileobj(self, file, **kwargs):
        """Deserialize and return an object read from the open ``file``."""

    @abstractmethod
    def dump_to_fileobj(self, obj, file, **kwargs):
        """Serialize ``obj`` into the open ``file``."""

    @abstractmethod
    def dump_to_str(self, obj, **kwargs):
        """Serialize ``obj`` and return the serialized form."""

    def load_from_path(self, filepath, mode='r', **kwargs):
        """Open ``filepath`` with ``mode`` and load its content.

        Args:
            filepath: Path handed to ``open``.
            mode (str): Mode handed to ``open``. Default: 'r'.
            **kwargs: Forwarded to :meth:`load_from_fileobj`.

        Returns:
            The value returned by :meth:`load_from_fileobj`.
        """
        with open(filepath, mode) as stream:
            loaded = self.load_from_fileobj(stream, **kwargs)
            return loaded

    def dump_to_path(self, obj, filepath, mode='w', **kwargs):
        """Open ``filepath`` with ``mode`` and dump ``obj`` into it.

        Args:
            obj: Object to serialize.
            filepath: Path handed to ``open``.
            mode (str): Mode handed to ``open``. Default: 'w'.
            **kwargs: Forwarded to :meth:`dump_to_fileobj`.
        """
        with open(filepath, mode) as stream:
            self.dump_to_fileobj(obj, stream, **kwargs)
class JsonHandler(BaseFileHandler):
    """File handler for the JSON format, backed by the ``json`` module."""

    def load_from_fileobj(self, file, **kwargs):
        """Parse JSON from the open text ``file``.

        Args:
            file: File-like object handed to ``json.load``.
            **kwargs: Forwarded to ``json.load``. Accepting them keeps this
                method in line with the abstract signature and with callers
                that forward keyword options to the handler.

        Returns:
            The deserialized Python object.
        """
        return json.load(file, **kwargs)

    def dump_to_fileobj(self, obj, file, **kwargs):
        """Serialize ``obj`` as JSON into the open text ``file``.

        ``set_default`` is used as the ``default`` hook unless the caller
        passes its own.
        """
        kwargs.setdefault('default', set_default)
        json.dump(obj, file, **kwargs)

    def dump_to_str(self, obj, **kwargs):
        """Serialize ``obj`` to a JSON string.

        ``set_default`` is used as the ``default`` hook unless the caller
        passes its own.

        Returns:
            str: The JSON document.
        """
        kwargs.setdefault('default', set_default)
        return json.dumps(obj, **kwargs)
class PickleHandler(BaseFileHandler):
    """File handler for the pickle format.

    Pickle works on bytes, so ``str_like`` is False and the path-based
    helpers open files in binary mode.
    """

    str_like = False

    def load_from_fileobj(self, file, **kwargs):
        """Unpickle and return one object from the open binary ``file``.

        NOTE(review): ``pickle.load`` should only be used on trusted data.
        """
        return pickle.load(file, **kwargs)

    def load_from_path(self, filepath, **kwargs):
        """Unpickle the content of ``filepath``, opened in 'rb' mode."""
        return super().load_from_path(filepath, mode='rb', **kwargs)

    def dump_to_str(self, obj, **kwargs):
        """Pickle ``obj`` and return the bytes (protocol 2 unless given)."""
        options = {'protocol': 2, **kwargs}
        return pickle.dumps(obj, **options)

    def dump_to_fileobj(self, obj, file, **kwargs):
        """Pickle ``obj`` into the open binary ``file`` (protocol 2 unless
        given)."""
        options = {'protocol': 2, **kwargs}
        pickle.dump(obj, file, **options)

    def dump_to_path(self, obj, filepath, **kwargs):
        """Pickle ``obj`` into ``filepath``, opened in 'wb' mode."""
        super().dump_to_path(obj, filepath, mode='wb', **kwargs)
class YamlHandler(BaseFileHandler):
    """File handler for the YAML format, backed by the ``yaml`` package.

    The module-level ``Loader`` and ``Dumper`` are used unless the caller
    passes its own via keyword arguments.
    """

    def load_from_fileobj(self, file, **kwargs):
        """Parse YAML from the open ``file`` and return the result.

        NOTE(review): the default ``Loader`` here is not the safe loader;
        confirm that only trusted files are loaded through this handler.
        """
        options = {'Loader': Loader, **kwargs}
        return yaml.load(file, **options)

    def dump_to_fileobj(self, obj, file, **kwargs):
        """Serialize ``obj`` as YAML into the open ``file``."""
        options = {'Dumper': Dumper, **kwargs}
        yaml.dump(obj, file, **options)

    def dump_to_str(self, obj, **kwargs):
        """Serialize ``obj`` as YAML and return what ``yaml.dump`` returns
        when no stream is given."""
        options = {'Dumper': Dumper, **kwargs}
        return yaml.dump(obj, **options)
def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs):
    """Dump data to json/yaml/pickle strings or files.

    This method provides a unified api for dumping data as strings or to files,
    and also supports custom arguments for each file format.

    Note:
        In v1.3.16 and later, ``dump`` supports dumping data as strings or to
        files which is saved to different backends.

    Args:
        obj (any): The python object to be dumped.
        file (str or :obj:`Path` or file-like object, optional): If not
            specified, then the object is dumped to a str, otherwise to a file
            specified by the filename or file-like object.
        file_format (str, optional): Same as :func:`load`. It is inferred
            from the extension when ``file`` is a path, and is required when
            ``file`` is None.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmcv.fileio.FileClient` for details.
            Default: None.
        **kwargs: Forwarded to the handler's dump method.

    Examples:
        >>> dump('hello world', '/path/of/your/file')  # disk
        >>> dump('hello world', 's3://path/of/your/file')  # ceph or petrel

    Returns:
        The serialized data when ``file`` is None (whatever the handler's
        ``dump_to_str`` returns), otherwise ``None``.

    Raises:
        ValueError: If both ``file`` and ``file_format`` are None.
        TypeError: If the format is unsupported, or ``file`` is neither a
            path nor an object with a ``write`` attribute.
    """
    if isinstance(file, Path):
        file = str(file)
    if file_format is None:
        if is_str(file):
            # Infer the format from the file extension.
            file_format = file.split('.')[-1]
        elif file is None:
            raise ValueError(
                'file_format must be specified since file is None')
    if file_format not in file_handlers:
        raise TypeError(f'Unsupported format: {file_format}')

    handler = file_handlers[file_format]

    # No target: hand the serialized data back to the caller.
    if file is None:
        return handler.dump_to_str(obj, **kwargs)

    # Path target: serialize into an in-memory buffer, then let the file
    # client store the buffer's content.
    if is_str(file):
        file_client = FileClient.infer_client(file_client_args, file)
        buffer = StringIO() if handler.str_like else BytesIO()
        with buffer:
            handler.dump_to_fileobj(obj, buffer, **kwargs)
            payload = buffer.getvalue()
            if handler.str_like:
                file_client.put_text(payload, file)
            else:
                file_client.put(payload, file)
        return None

    # File-like target: write straight into it.
    if hasattr(file, 'write'):
        handler.dump_to_fileobj(obj, file, **kwargs)
        return None

    raise TypeError('"file" must be a filename str or a file-object')
def register_handler(file_formats, **kwargs):
    """Return a class decorator that registers a handler class.

    The decorated class is instantiated with ``kwargs`` and the instance is
    passed to ``_register_handler`` together with ``file_formats``. The
    class itself is returned unchanged.

    Args:
        file_formats (str or list[str]): File formats to be handled by the
            decorated handler.
        **kwargs: Arguments used to instantiate the decorated class.
    """

    def _decorate(handler_cls):
        handler_instance = handler_cls(**kwargs)
        _register_handler(handler_instance, file_formats)
        return handler_cls

    return _decorate
+ """ + cnt = 0 + item_list = [] + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for _ in range(offset): + f.readline() + for line in f: + if 0 < max_num <= cnt: + break + item_list.append(prefix + line.rstrip('\n\r')) + cnt += 1 + return item_list + + +def dict_from_file(filename, + key_type=str, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a dict. + + Each line of the text file will be two or more columns split by + whitespaces or tabs. The first column will be parsed as dict keys, and + the following columns will be parsed as dict values. + + Note: + In v1.3.16 and later, ``dict_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a dict. + + Args: + filename(str): Filename. + key_type(type): Type of the dict keys. str is user by default and + type conversion will be performed if specified. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dict_from_file('/path/of/your/file') # disk + {'key1': 'value1', 'key2': 'value2'} + >>> dict_from_file('s3://path/of/your/file') # ceph or petrel + {'key1': 'value1', 'key2': 'value2'} + + Returns: + dict: The parsed contents. 
+ """ + mapping = {} + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for line in f: + items = line.rstrip('\n').split() + assert len(items) >= 2 + key = key_type(items[0]) + val = items[1:] if len(items) > 2 else items[1] + mapping[key] = val + return mapping diff --git a/lavis/common/annotator/uniformer/mmcv/image/__init__.py b/lavis/common/annotator/uniformer/mmcv/image/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0051d609d3de4e7562e3fe638335c66617c4d91 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/image/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, + gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, + rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) +from .geometric import (cutout, imcrop, imflip, imflip_, impad, + impad_to_multiple, imrescale, imresize, imresize_like, + imresize_to_multiple, imrotate, imshear, imtranslate, + rescale_size) +from .io import imfrombytes, imread, imwrite, supported_backends, use_backend +from .misc import tensor2imgs +from .photometric import (adjust_brightness, adjust_color, adjust_contrast, + adjust_lighting, adjust_sharpness, auto_contrast, + clahe, imdenormalize, imequalize, iminvert, + imnormalize, imnormalize_, lut_transform, posterize, + solarize) + +__all__ = [ + 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', + 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', + 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', + 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', + 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', + 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', + 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', + 'tensor2imgs', 'imshear', 
'imtranslate', 'adjust_color', 'imequalize', + 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' +] diff --git a/lavis/common/annotator/uniformer/mmcv/image/colorspace.py b/lavis/common/annotator/uniformer/mmcv/image/colorspace.py new file mode 100644 index 0000000000000000000000000000000000000000..814533952fdfda23d67cb6a3073692d8c1156add --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/image/colorspace.py @@ -0,0 +1,306 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + + +def imconvert(img, src, dst): + """Convert an image from the src colorspace to dst colorspace. + + Args: + img (ndarray): The input image. + src (str): The source colorspace, e.g., 'rgb', 'hsv'. + dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. + + Returns: + ndarray: The converted image. + """ + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + out_img = cv2.cvtColor(img, code) + return out_img + + +def bgr2gray(img, keepdim=False): + """Convert a BGR image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def rgb2gray(img, keepdim=False): + """Convert a RGB image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def gray2bgr(img): + """Convert a grayscale image to BGR image. + + Args: + img (ndarray): The input image. 
def _convert_input_type_range(img):
    """Convert the type and range of the input image.

    The image is converted to np.float32 with range [0, 1]. It is mainly
    used for pre-processing the input image in colorspace conversion
    functions such as rgb2ycbcr and ycbcr2rgb.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].

    Returns:
        (ndarray): A new array with type np.float32 and range [0, 1]. The
            input array is never modified.

    Raises:
        TypeError: If the dtype of ``img`` is neither np.float32 nor
            np.uint8.
    """
    img_type = img.dtype
    # Reject unsupported dtypes before converting, so a bad input does not
    # pay for a full-size copy that is thrown away immediately.
    if img_type != np.float32 and img_type != np.uint8:
        raise TypeError('The img type should be np.float32 or np.uint8, '
                        f'but got {img_type}')
    # ``astype`` returns a copy, so the in-place scaling below only touches
    # our private array.
    img = img.astype(np.float32)
    if img_type == np.uint8:
        img /= 255.
    return img


def _convert_output_type_range(img, dst_type):
    """Convert the type and range of the image according to dst_type.

    If `dst_type` is np.uint8, the image is rounded and cast to np.uint8
    (range [0, 255]). If `dst_type` is np.float32, the image is divided by
    255 and cast to np.float32 (range [0, 1]).
    It is mainly used for post-processing images in colorspace conversion
    functions such as rgb2ycbcr and ycbcr2rgb.

    Args:
        img (ndarray): The image to be converted, with values in range
            [0, 255].
        dst_type (np.uint8 | np.float32): The desired output type.

    Returns:
        (ndarray): A new array with the desired type and range. ``img``
            itself is left untouched.

    Raises:
        TypeError: If ``dst_type`` is neither np.uint8 nor np.float32.
    """
    if dst_type not in (np.uint8, np.float32):
        raise TypeError('The dst_type should be np.float32 or np.uint8, '
                        f'but got {dst_type}')
    if dst_type == np.uint8:
        img = img.round()
    else:
        # Out-of-place division: ``img /= 255.`` would silently rescale the
        # caller's array and raises for integer-typed input.
        img = img / 255.
    return img.astype(dst_type)


def rgb2ycbcr(img, y_only=False):
    """Convert a RGB image to YCbCr image.

    This function produces the same results as Matlab's `rgb2ycbcr` function.
    It implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].
        y_only (bool): Whether to only return Y channel. Default: False.

    Returns:
        ndarray: The converted YCbCr image. The output image has the same type
            and range as input image.
    """
    img_type = img.dtype
    img = _convert_input_type_range(img)
    if y_only:
        # Luma only: the channel axis is contracted away by the dot product.
        out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0
    else:
        # Columns of the matrix produce Y, Cb and Cr respectively.
        out_img = np.matmul(
            img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
                  [24.966, 112.0, -18.214]]) + [16, 128, 128]
    out_img = _convert_output_type_range(out_img, img_type)
    return out_img
See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2rgb(img): + """Convert a YCbCr image to RGB image. + + This function produces the same results as Matlab's ycbcr2rgb function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted RGB image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0, -0.00153632, 0.00791071], + [0.00625893, -0.00318811, 0]]) * 255.0 + [ + -222.921, 135.576, -276.836 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2bgr(img): + """Convert a YCbCr image to BGR image. + + The bgr version of ycbcr2rgb. 
+ It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted BGR image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0.00791071, -0.00153632, 0], + [0, -0.00318811, 0.00625893]]) * 255.0 + [ + -276.836, 135.576, -222.921 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def convert_color_factory(src, dst): + + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + + def convert_color(img): + out_img = cv2.cvtColor(img, code) + return out_img + + convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} + image. + + Args: + img (ndarray or str): The input image. + + Returns: + ndarray: The converted {dst.upper()} image. 
+ """ + + return convert_color + + +bgr2rgb = convert_color_factory('bgr', 'rgb') + +rgb2bgr = convert_color_factory('rgb', 'bgr') + +bgr2hsv = convert_color_factory('bgr', 'hsv') + +hsv2bgr = convert_color_factory('hsv', 'bgr') + +bgr2hls = convert_color_factory('bgr', 'hls') + +hls2bgr = convert_color_factory('hls', 'bgr') diff --git a/lavis/common/annotator/uniformer/mmcv/image/geometric.py b/lavis/common/annotator/uniformer/mmcv/image/geometric.py new file mode 100644 index 0000000000000000000000000000000000000000..cf97c201cb4e43796c911919d03fb26a07ed817d --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/image/geometric.py @@ -0,0 +1,728 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers + +import cv2 +import numpy as np + +from ..utils import to_2tuple +from .io import imread_backend + +try: + from PIL import Image +except ImportError: + Image = None + + +def _scale_size(size, scale): + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | tuple(float)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) + + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + +if Image is not None: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image to a given size. + + Args: + img (ndarray): The input image. + size (tuple[int]): Target size (w, h). + return_scale (bool): Whether to return `w_scale` and `h_scale`. 
+ interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +def imresize_to_multiple(img, + divisor, + size=None, + scale_factor=None, + keep_ratio=False, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image according to a given size or scale factor and then rounds + up the the resized or rescaled image size to the nearest value that can be + divided by the divisor. + + Args: + img (ndarray): The input image. + divisor (int | tuple): Resized image size will be a multiple of + divisor. If divisor is a tuple, divisor should be + (w_divisor, h_divisor). + size (None | int | tuple[int]): Target size (w, h). Default: None. + scale_factor (None | float | tuple[float]): Multiplier for spatial + size. 
Should match input size if it is a tuple and the 2D style is + (w_scale_factor, h_scale_factor). Default: None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Default: False. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if size is not None and scale_factor is not None: + raise ValueError('only one of size or scale_factor should be defined') + elif size is None and scale_factor is None: + raise ValueError('one of size or scale_factor should be defined') + elif size is not None: + size = to_2tuple(size) + if keep_ratio: + size = rescale_size((w, h), size, return_scale=False) + else: + size = _scale_size((w, h), scale_factor) + + divisor = to_2tuple(divisor) + size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) + resized_img, w_scale, h_scale = imresize( + img, + size, + return_scale=True, + interpolation=interpolation, + out=out, + backend=backend) + if return_scale: + return resized_img, w_scale, h_scale + else: + return resized_img + + +def imresize_like(img, + dst_img, + return_scale=False, + interpolation='bilinear', + backend=None): + """Resize image to the same size of a given image. + + Args: + img (ndarray): The input image. + dst_img (ndarray): The target image. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. 
+ + Returns: + tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = dst_img.shape[:2] + return imresize(img, (w, h), return_scale, interpolation, backend=backend) + + +def rescale_size(old_size, scale, return_scale=False): + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale(img, + scale, + return_scale=False, + interpolation='bilinear', + backend=None): + """Resize image while keeping the aspect ratio. + + Args: + img (ndarray): The input image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. 
+ + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +def imflip(img, direction='horizontal'): + """Flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image. + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return np.flip(img, axis=1) + elif direction == 'vertical': + return np.flip(img, axis=0) + else: + return np.flip(img, axis=(0, 1)) + + +def imflip_(img, direction='horizontal'): + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def imrotate(img, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear', + auto_bound=False): + """Rotate an image. + + Args: + img (ndarray): Image to be rotated. + angle (float): Rotation angle in degrees, positive values mean + clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. + scale (float): Isotropic scale factor. + border_value (int): Border value. + interpolation (str): Same as :func:`resize`. 
def bbox_clip(bboxes, img_shape):
    """Clip bboxes so that every coordinate lies inside the image.

    Args:
        bboxes (ndarray): Shape (..., 4*k), coordinates alternate x, y.
        img_shape (tuple[int]): (height, width) of the image; any entries
            after the first two are ignored.

    Returns:
        ndarray: Clipped bboxes.
    """
    assert bboxes.shape[-1] % 4 == 0
    height, width = img_shape[0], img_shape[1]
    # Per-coordinate upper bound: x slots get width - 1, y slots height - 1.
    upper = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
    upper[1::2] = height - 1
    upper[0::2] = width - 1
    capped = np.minimum(bboxes, upper)
    return np.maximum(capped, 0)


def bbox_scaling(bboxes, scale, clip_shape=None):
    """Scale bboxes about their centers.

    Args:
        bboxes (ndarray): Shape(..., 4).
        scale (float): Scaling factor.
        clip_shape (tuple[int], optional): If specified, bboxes that exceed the
            boundary will be clipped according to the given shape (h, w).

    Returns:
        ndarray: Scaled bboxes (always a new array).
    """
    if float(scale) != 1.0:
        box_w = bboxes[..., 2] - bboxes[..., 0] + 1
        box_h = bboxes[..., 3] - bboxes[..., 1] + 1
        # Half of the size change is applied to each side of the box.
        grow_x = (box_w * (scale - 1)) * 0.5
        grow_y = (box_h * (scale - 1)) * 0.5
        offsets = np.stack((-grow_x, -grow_y, grow_x, grow_y), axis=-1)
        scaled_bboxes = bboxes + offsets
    else:
        scaled_bboxes = bboxes.copy()

    if clip_shape is None:
        return scaled_bboxes
    return bbox_clip(scaled_bboxes, clip_shape)
+ patches.append(patch) + + if bboxes.ndim == 1: + return patches[0] + else: + return patches + + +def impad(img, + *, + shape=None, + padding=None, + pad_val=0, + padding_mode='constant'): + """Pad the given image to a certain shape or pad on all sides with + specified padding mode and padding value. + + Args: + img (ndarray): Image to be padded. + shape (tuple[int]): Expected padding shape (h, w). Default: None. + padding (int or tuple[int]): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 is + provided this is the padding on left/right and top/bottom + respectively. If a tuple of length 4 is provided this is the + padding for the left, top, right and bottom borders respectively. + Default: None. Note that `shape` and `padding` can not be both + set. + pad_val (Number | Sequence[Number]): Values to be filled in padding + areas when padding_mode is 'constant'. Default: 0. + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Default: constant. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the + last value on the edge. For example, padding [1, 2, 3, 4] + with 2 elements on both sides in reflect mode will result + in [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with + 2 elements on both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + ndarray: The padded image. 
+ """ + + assert (shape is not None) ^ (padding is not None) + if shape is not None: + padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) + + # check pad_val + if isinstance(pad_val, tuple): + assert len(pad_val) == img.shape[-1] + elif not isinstance(pad_val, numbers.Number): + raise TypeError('pad_val must be a int or a tuple. ' + f'But received {type(pad_val)}') + + # check padding + if isinstance(padding, tuple) and len(padding) in [2, 4]: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + elif isinstance(padding, numbers.Number): + padding = (padding, padding, padding, padding) + else: + raise ValueError('Padding must be a int or a 2, or 4 element tuple.' + f'But received {padding}') + + # check padding mode + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + + border_type = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT + } + img = cv2.copyMakeBorder( + img, + padding[1], + padding[3], + padding[0], + padding[2], + border_type[padding_mode], + value=pad_val) + + return img + + +def impad_to_multiple(img, divisor, pad_val=0): + """Pad an image to ensure each edge to be multiple to some number. + + Args: + img (ndarray): Image to be padded. + divisor (int): Padded image edges will be multiple to divisor. + pad_val (Number | Sequence[Number]): Same as :func:`impad`. + + Returns: + ndarray: The padded image. + """ + pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor + pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor + return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) + + +def cutout(img, shape, pad_val=0): + """Randomly cut out a rectangle from the original img. + + Args: + img (ndarray): Image to be cutout. + shape (int | tuple[int]): Expected cutout shape (h, w). If given as a + int, the value will be used for both h and w. 
+ pad_val (int | float | tuple[int | float]): Values to be filled in the + cut area. Defaults to 0. + + Returns: + ndarray: The cutout image. + """ + + channels = 1 if img.ndim == 2 else img.shape[2] + if isinstance(shape, int): + cut_h, cut_w = shape, shape + else: + assert isinstance(shape, tuple) and len(shape) == 2, \ + f'shape must be a int or a tuple with length 2, but got type ' \ + f'{type(shape)} instead.' + cut_h, cut_w = shape + if isinstance(pad_val, (int, float)): + pad_val = tuple([pad_val] * channels) + elif isinstance(pad_val, tuple): + assert len(pad_val) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(pad_val), channels) + else: + raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') + + img_h, img_w = img.shape[:2] + y0 = np.random.uniform(img_h) + x0 = np.random.uniform(img_w) + + y1 = int(max(0, y0 - cut_h / 2.)) + x1 = int(max(0, x0 - cut_w / 2.)) + y2 = min(img_h, y1 + cut_h) + x2 = min(img_w, x1 + cut_w) + + if img.ndim == 2: + patch_shape = (y2 - y1, x2 - x1) + else: + patch_shape = (y2 - y1, x2 - x1, channels) + + img_cutout = img.copy() + patch = np.array( + pad_val, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + img_cutout[y1:y2, x1:x2, ...] = patch + + return img_cutout + + +def _get_shear_matrix(magnitude, direction='horizontal'): + """Generate the shear matrix for transformation. + + Args: + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + + Returns: + ndarray: The shear matrix with dtype float32. + """ + if direction == 'horizontal': + shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) + elif direction == 'vertical': + shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) + return shear_matrix + + +def imshear(img, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear an image. 
+ + Args: + img (ndarray): Image to be sheared with format (h, w) + or (h, w, c). + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The sheared image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`') + shear_matrix = _get_shear_matrix(magnitude, direction) + sheared = cv2.warpAffine( + img, + shear_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. shearing masks whose channels large + # than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return sheared + + +def _get_translate_matrix(offset, direction='horizontal'): + """Generate the translate matrix. + + Args: + offset (int | float): The offset used for translate. + direction (str): The translate direction, either + "horizontal" or "vertical". + + Returns: + ndarray: The translate matrix with dtype float32. 
def imtranslate(img,
                offset,
                direction='horizontal',
                border_value=0,
                interpolation='bilinear'):
    """Translate an image.

    Args:
        img (ndarray): Image to be translated with format
            (h, w) or (h, w, c).
        offset (int | float): The offset used for translate.
        direction (str): The translate direction, either "horizontal"
            or "vertical".
        border_value (int | float | tuple): Value used in case of a
            constant border. A scalar is repeated for every channel; a
            tuple must have one element per channel.
        interpolation (str): Same as :func:`resize`.

    Returns:
        ndarray: The translated image.

    Raises:
        ValueError: If ``img`` is not 2-D or 3-D, or if ``border_value``
            is neither a number nor a tuple.
    """
    assert direction in ['horizontal',
                         'vertical'], f'Invalid direction: {direction}'
    height, width = img.shape[:2]
    if img.ndim == 2:
        channels = 1
    elif img.ndim == 3:
        channels = img.shape[-1]
    else:
        # Without this branch `channels` stayed unbound and the failure
        # surfaced later as an UnboundLocalError.
        raise ValueError(
            f'img must have 2 or 3 dimensions, but got {img.ndim}')
    if isinstance(border_value, (int, float)):
        border_value = tuple([border_value] * channels)
    elif isinstance(border_value, tuple):
        assert len(border_value) == channels, \
            'Expected the num of elements in tuple equals the channels ' \
            'of input image. Found {} vs {}'.format(
                len(border_value), channels)
    else:
        raise ValueError(
            f'Invalid type {type(border_value)} for `border_value`.')
    translate_matrix = _get_translate_matrix(offset, direction)
    translated = cv2.warpAffine(
        img,
        translate_matrix,
        (width, height),
        # Note case when the number elements in `border_value`
        # greater than 3 (e.g. translating masks whose channels
        # large than 3) will raise TypeError in `cv2.warpAffine`.
        # Here simply slice the first 3 values in `border_value`.
        borderValue=border_value[:3],
        flags=cv2_interp_codes[interpolation])
    return translated
def _jpegflag(flag='color', channel_order='bgr'):
    """Pick the TurboJPEG constant for a colour flag and channel order.

    Args:
        flag (str): Either 'color' or 'grayscale'.
        channel_order (str): Either 'bgr' or 'rgb' (case-insensitive).
            Only consulted when ``flag`` is 'color'.

    Returns:
        The constant handed to ``jpeg.decode`` by the callers.

    Raises:
        ValueError: If ``channel_order`` or ``flag`` is not one of the
            accepted values.
    """
    order = channel_order.lower()
    if order not in ('rgb', 'bgr'):
        raise ValueError('channel order must be either "rgb" or "bgr"')

    if flag == 'grayscale':
        return TJPF_GRAY
    if flag != 'color':
        raise ValueError('flag must be "color" or "grayscale"')
    # NOTE(review): the RGB case returns TJCS_RGB while the other cases
    # return TJPF_* names -- confirm against PyTurboJPEG that this is the
    # intended constant.
    return TJPF_BGR if order == 'bgr' else TJCS_RGB


def _pillow2array(img, flag='color', channel_order='bgr'):
    """Convert a pillow image to numpy array.

    Args:
        img (:obj:`PIL.Image.Image`): The image loaded using PIL
        flag (str): Flags specifying the color type of a loaded image,
            candidates are 'color', 'grayscale', 'unchanged',
            'color_ignore_orientation' and 'grayscale_ignore_orientation'.
            Default to 'color'.
        channel_order (str): The channel order of the output image array,
            candidates are 'bgr' and 'rgb'. Default to 'bgr'. It is not
            consulted for the 'unchanged' and grayscale flags.

    Returns:
        np.ndarray: The converted numpy array

    Raises:
        ValueError: If ``channel_order`` or ``flag`` is not supported.
    """
    order = channel_order.lower()
    if order not in ('rgb', 'bgr'):
        raise ValueError('channel order must be either "rgb" or "bgr"')

    if flag == 'unchanged':
        pixels = np.array(img)
        # Only arrays with at least three planes get their first three
        # planes reversed (RGB to BGR); any further plane stays in place.
        if pixels.ndim >= 3 and pixels.shape[2] >= 3:
            pixels[:, :, :3] = pixels[:, :, (2, 1, 0)]
        return pixels

    # Handle exif orientation tag
    if flag in ('color', 'grayscale'):
        img = ImageOps.exif_transpose(img)

    # Bring every mode to 'RGB' before the final conversion.
    if img.mode == 'LA':
        # When the mode is 'LA', the default conversion will fill in
        # the canvas with black, which sometimes shadows black objects
        # in the foreground.
        #
        # Therefore, a random color (124, 117, 104) is used for canvas
        rgba = img.convert('RGBA')
        img = Image.new('RGB', rgba.size, (124, 117, 104))
        img.paste(rgba, mask=rgba.split()[3])  # 3 is alpha
    elif img.mode != 'RGB':
        # Most formats except 'LA' can be directly converted to RGB
        img = img.convert('RGB')

    if flag in ('color', 'color_ignore_orientation'):
        pixels = np.array(img)
        if order != 'rgb':
            pixels = pixels[:, :, ::-1]  # RGB to BGR
        return pixels
    if flag in ('grayscale', 'grayscale_ignore_orientation'):
        return np.array(img.convert('L'))
    raise ValueError(
        'flag must be "color", "grayscale", "unchanged", '
        f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
        f' but got {flag}')
def imfrombytes(content, flag='color', channel_order='bgr', backend=None):
    """Read an image from bytes.

    Args:
        content (bytes): Image bytes got from files or other streams.
        flag (str): Same as :func:`imread`.
        channel_order (str): Order of channel, candidates are `bgr` and
            `rgb`.
        backend (str | None): The image decoding backend type. Options are
            `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is
            None, the global imread_backend specified by
            ``mmcv.use_backend()`` will be used. There is no dedicated
            `tifffile` branch here, so that backend is decoded with
            OpenCV like `cv2`. Default: None.

    Returns:
        ndarray: Loaded image array. On the OpenCV path the result of
        ``cv2.imdecode`` is returned as is, which is ``None`` when OpenCV
        cannot decode the buffer.

    Raises:
        ValueError: If ``backend`` is not one of the supported backends.
    """

    if backend is None:
        backend = imread_backend
    if backend not in supported_backends:
        # The message now names every entry of `supported_backends`.
        raise ValueError(
            f'backend: {backend} is not supported. Supported '
            "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
    if backend == 'turbojpeg':
        img = jpeg.decode(content, _jpegflag(flag, channel_order))
        # Drop a trailing single-channel axis so grayscale is (h, w).
        if img.shape[-1] == 1:
            img = img[:, :, 0]
        return img
    elif backend == 'pillow':
        buff = io.BytesIO(content)
        img = Image.open(buff)
        img = _pillow2array(img, flag, channel_order)
        return img
    else:
        img_np = np.frombuffer(content, np.uint8)
        flag = imread_flags[flag] if is_str(flag) else flag
        img = cv2.imdecode(img_np, flag)
        # Guard against a failed decode: converting `None` used to raise
        # inside cv2 for 'rgb' while 'bgr' silently returned `None`.
        if (img is not None and flag == IMREAD_COLOR
                and channel_order == 'rgb'):
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace
        return img


def imwrite(img, file_path, params=None, auto_mkdir=True):
    """Write image to file.

    Args:
        img (ndarray): Image array to be written.
        file_path (str): Image file path.
        params (None or list): Same as opencv :func:`imwrite` interface.
        auto_mkdir (bool): If the parent folder of `file_path` does not exist,
            whether to create it automatically.

    Returns:
        bool: Successful or not, as reported by ``cv2.imwrite``.
    """
    if auto_mkdir:
        # Resolve the parent directory to an absolute path before creating
        # it, so a bare file name maps to the current working directory.
        dir_name = osp.abspath(osp.dirname(file_path))
        mkdir_or_exist(dir_name)
    return cv2.imwrite(file_path, img, params)
def imnormalize(img, mean, std, to_rgb=True):
    """Normalize an image with mean and std, leaving the input untouched.

    A float32 copy of ``img`` is made and handed to :func:`imnormalize_`,
    which performs the normalization in place on that copy.

    Args:
        img (ndarray): Image to be normalized.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to convert to rgb.

    Returns:
        ndarray: The normalized image.
    """
    # `astype` already allocates a fresh array, so the extra `copy()` that
    # used to precede it only doubled the allocation. `order='C'` keeps
    # the buffer C-contiguous, exactly as that intermediate copy did.
    img = img.astype(np.float32, order='C')
    return imnormalize_(img, mean, std, to_rgb)
def imdenormalize(img, mean, std, to_bgr=True):
    """Undo a mean/std normalization: ``img * std + mean``.

    Args:
        img (ndarray): Normalized image. Must not be of dtype uint8.
        mean (ndarray): The mean that was subtracted during normalization.
        std (ndarray): The std that was divided by during normalization.
        to_bgr (bool): Whether to swap the channel order from RGB to BGR
            after denormalizing.

    Returns:
        ndarray: The denormalized image. The input array is not modified.
    """
    assert img.dtype != np.uint8
    mean = mean.reshape(1, -1).astype(np.float64)
    std = std.reshape(1, -1).astype(np.float64)
    img = cv2.multiply(img, std)  # make a copy
    cv2.add(img, mean, img)  # inplace
    if to_bgr:
        cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img)  # inplace
    return img


def iminvert(img):
    """Invert (negate) an image.

    Args:
        img (ndarray): Image to be inverted.

    Returns:
        ndarray: The inverted image, i.e. ``255 - img`` element-wise.
    """
    return np.full_like(img, 255) - img


def solarize(img, thr=128):
    """Solarize an image (invert all pixel values at or above a threshold)

    Args:
        img (ndarray): Image to be solarized.
        thr (int): Threshold for solarizing (0 - 255). Values below it are
            kept, all others are replaced by ``255 - value``.

    Returns:
        ndarray: The solarized image.
    """
    img = np.where(img < thr, img, 255 - img)
    return img


def posterize(img, bits):
    """Posterize an image (reduce the number of bits for each color channel)

    Args:
        img (ndarray): Image to be posterized.
        bits (int): Number of bits (1 to 8) to use for posterizing.

    Returns:
        ndarray: The posterized image.

    Raises:
        ValueError: If ``bits`` is greater than 8.
    """
    if bits > 8:
        # `8 - bits` would be a negative shift count, which has no defined
        # meaning for the bit shifts below.
        raise ValueError(f'bits must be at most 8, but got {bits}')
    shift = 8 - bits
    # Shifting right then left clears the `shift` lowest bits of each value.
    img = np.left_shift(np.right_shift(img, shift), shift)
    return img
def imequalize(img):
    """Equalize the image histogram.

    Each of the first three channels is remapped independently through a
    look-up table built from its cumulative histogram, which spreads the
    values over the available range.

    Args:
        img (ndarray): Image to be equalized.

    Returns:
        ndarray: The equalized image, with the dtype of ``img``.
    """

    def _equalize_plane(plane):
        """Remap one 2-D channel through its equalization table."""
        counts = np.histogram(plane, 256, (0, 255))[0]
        # The step is derived from the occupied bins only, leaving out the
        # last occupied one.
        occupied = counts[counts > 0]
        step = (np.sum(occupied) - occupied[-1]) // 255
        if step:
            # Cumulative sum shifted by step // 2 and normalized by step,
            # then moved one slot to the right with a leading 0.
            table = (np.cumsum(counts) + (step // 2)) // step
            table = np.concatenate([[0], table[:-1]], 0)
            # handle potential integer overflow
            table = np.minimum(table, 255)
        else:
            table = np.arange(256)
        # A zero step means the channel is returned unchanged.
        return np.where(np.equal(step, 0), plane, table[plane])

    planes = [_equalize_plane(img[:, :, idx]) for idx in range(3)]
    return np.stack(planes, axis=-1).astype(img.dtype)


def adjust_brightness(img, factor=1.):
    """Adjust image brightness.

    The image is blended with an all-black image of the same shape:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    so a factor of 0.0 gives a black image and 1.0 the original one.

    Args:
        img (ndarray): Image to be brightened.
        factor (float): A value controls the enhancement.
            Factor 1.0 returns the original image, lower
            factors mean less color (brightness, contrast,
            etc), and higher values more. Default 1.

    Returns:
        ndarray: The brightened image, with the dtype of ``img``.
    """
    black = np.zeros_like(img)
    # Blend in float32 to stay as close as possible to
    # PIL.ImageEnhance.Brightness; beta is 1 - factor and gamma is 0.
    blended = cv2.addWeighted(
        img.astype(np.float32), factor, black.astype(np.float32),
        1 - factor, 0)
    return np.clip(blended, 0, 255).astype(img.dtype)
def auto_contrast(img, cutoff=0):
    """Auto adjust image contrast.

    This function maximize (normalize) image contrast by first removing cutoff
    percent of the lightest and darkest pixels from the histogram and remapping
    the image so that the darkest pixel becomes black (0), and the lightest
    becomes white (255).

    Args:
        img (ndarray): Image to be contrasted. BGR order.
        cutoff (int | float | tuple): The cutoff percent of the lightest and
            darkest pixels to be removed. If given as tuple, it shall be
            (low, high). Otherwise, the single value will be used for both.
            Defaults to 0.

    Returns:
        ndarray: The contrasted image. A channel whose histogram is emptied
        by the cutoff, or collapses to a single bin, is returned unchanged.
    """

    def _auto_contrast_channel(im, c, cutoff):
        """Stretch channel ``c`` of ``im`` to the full 0-255 range."""
        im = im[:, :, c]
        # Compute the histogram of the image channel.
        histo = np.histogram(im, 256, (0, 255))[0]
        # Remove cut-off percent pixels from histo
        histo_sum = np.cumsum(histo)
        cut_low = histo_sum[-1] * cutoff[0] // 100
        cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
        histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
        histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)

        # Compute mapping
        occupied = np.nonzero(histo)[0]
        # If all the values have been cut off, return the origin img.
        # This has to be checked before indexing: an empty `occupied`
        # used to raise IndexError (e.g. cutoff=50 on an even pixel count).
        if occupied.size == 0:
            return im
        low, high = occupied[0], occupied[-1]
        if low >= high:
            return im
        scale = 255.0 / (high - low)
        offset = -low * scale
        lut = np.arange(256)
        lut = lut * scale + offset
        lut = np.clip(lut, 0, 255)
        return lut[im]

    if isinstance(cutoff, (int, float)):
        cutoff = (cutoff, cutoff)
    else:
        assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
            f'float or tuple, but got {type(cutoff)} instead.'
    # Auto adjusts contrast for each channel independently and then stacks
    # the result.
    s1 = _auto_contrast_channel(img, 0, cutoff)
    s2 = _auto_contrast_channel(img, 1, cutoff)
    s3 = _auto_contrast_channel(img, 2, cutoff)
    contrasted_img = np.stack([s1, s2, s3], axis=-1)
    return contrasted_img.astype(img.dtype)
def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
    """AlexNet-style PCA jitter.

    This data augmentation is proposed in "ImageNet Classification with
    Deep Convolutional Neural Networks". One random coefficient is drawn
    per eigenvalue, and the resulting per-channel shift is added to every
    pixel.

    Args:
        img (ndarray): Image to be adjusted lighting. BGR order.
        eigval (ndarray): the eigenvalue of the covariance matrix of pixel
            values, respectively.
        eigvec (ndarray): the eigenvector of the covariance matrix of pixel
            values, respectively. Its shape must be (3, len(eigval)).
        alphastd (float): The standard deviation for distribution of alpha.
            Defaults to 0.1
        to_rgb (bool): Whether to convert img to rgb.

    Returns:
        ndarray: The adjusted image.
    """
    assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
        f'eigval and eigvec should both be of type np.ndarray, got ' \
        f'{type(eigval)} and {type(eigvec)} instead.'

    assert eigval.ndim == 1 and eigvec.ndim == 2
    assert eigvec.shape == (3, eigval.shape[0])
    num_components = eigval.shape[0]
    assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
        f'got {type(alphastd)} instead.'

    work = img.copy().astype(np.float32)
    if to_rgb:
        cv2.cvtColor(work, cv2.COLOR_BGR2RGB, work)  # inplace

    alpha = np.random.normal(0, alphastd, num_components)
    # Weight every eigenvector column by its random coefficient and its
    # eigenvalue; the (1, n) rows broadcast against the (3, n) matrix.
    weighted = eigvec \
        * alpha.reshape(1, num_components) \
        * eigval.reshape(1, num_components)
    # Summing over the components leaves one shift per channel.
    shift = weighted.sum(axis=1).reshape(1, 1, 3)
    return work + np.broadcast_to(shift, work.shape)


def lut_transform(img, lut_table):
    """Transform array by look-up table.

    Every value of the input array is used as an index into the look-up
    table, and the entry found there becomes the output value.

    Args:
        img (ndarray): Image to be transformed. Its values must lie in
            the range 0 to 255.
        lut_table (ndarray): look-up table of 256 elements; the shape is
            asserted to be ``(256, )``.

    Returns:
        ndarray: The transformed image.
    """
    assert isinstance(img, np.ndarray)
    assert 0 <= np.min(img) and np.max(img) <= 255
    assert isinstance(lut_table, np.ndarray)
    assert lut_table.shape == (256, )

    # The image is converted to uint8 before it is handed to cv2.LUT.
    as_uint8 = np.array(img, dtype=np.uint8)
    return cv2.LUT(as_uint8, lut_table)
+ """ + assert isinstance(img, np.ndarray) + assert img.ndim == 2 + assert isinstance(clip_limit, (float, int)) + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + + clahe = cv2.createCLAHE(clip_limit, tile_grid_size) + return clahe.apply(np.array(img, dtype=np.uint8)) diff --git a/lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json b/lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json new file mode 100644 index 0000000000000000000000000000000000000000..25cf6f28caecc22a77e3136fefa6b8dfc0e6cb5b --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json @@ -0,0 +1,6 @@ +{ + "resnet50_caffe": "detectron/resnet50_caffe", + "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", + "resnet101_caffe": "detectron/resnet101_caffe", + "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" +} diff --git a/lavis/common/annotator/uniformer/mmcv/model_zoo/mmcls.json b/lavis/common/annotator/uniformer/mmcv/model_zoo/mmcls.json new file mode 100644 index 0000000000000000000000000000000000000000..bdb311d9fe6d9f317290feedc9e37236c6cf6e8f --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/model_zoo/mmcls.json @@ -0,0 +1,31 @@ +{ + "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth", + "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth", + "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth", + "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth", + "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth", + "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth", + "vgg16_bn": 
"https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth", + "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth", + "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth", + "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth", + "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth", + "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth", + "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth", + "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth", + "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth", + "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth", + "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth", + "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth", + "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth", + "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth", + "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth", + "se-resnet101": 
"https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth", + "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth", + "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth", + "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth", + "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth", + "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth", + "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth", + "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth" +} diff --git a/lavis/common/annotator/uniformer/mmcv/model_zoo/open_mmlab.json b/lavis/common/annotator/uniformer/mmcv/model_zoo/open_mmlab.json new file mode 100644 index 0000000000000000000000000000000000000000..8311db4feef92faa0841c697d75efbee8430c3a0 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/model_zoo/open_mmlab.json @@ -0,0 +1,50 @@ +{ + "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth", + "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth", + "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth", + "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth", + "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth", + "detectron2/resnext101_32x8d": 
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth", + "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", + "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth", + "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", + "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", + "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth", + "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth", + "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", + "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", + "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", + "jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", + "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", + "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", + "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth", + "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth", + "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", + "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", + "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth", + "bninception_caffe": 
"https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", + "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", + "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", + "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth", + "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth", + "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth", + "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth", + "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth", + "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth", + "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth", + "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth", + "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth", + "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth", + "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth", + "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth", + "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth", + "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth", + "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth", + "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth", + 
"contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth", + "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth", + "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth", + "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth", + "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth", + "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth" +} diff --git a/lavis/common/annotator/uniformer/mmcv/ops/__init__.py b/lavis/common/annotator/uniformer/mmcv/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..999e090a458ee148ceca0649f1e3806a40e909bd --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/ops/__init__.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .assign_score_withk import assign_score_withk +from .ball_query import ball_query +from .bbox import bbox_overlaps +from .border_align import BorderAlign, border_align +from .box_iou_rotated import box_iou_rotated +from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive +from .cc_attention import CrissCrossAttention +from .contour_expand import contour_expand +from .corner_pool import CornerPool +from .correlation import Correlation +from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d +from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, + ModulatedDeformRoIPoolPack, deform_roi_pool) +from .deprecated_wrappers import Conv2d_deprecated as Conv2d +from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d +from .deprecated_wrappers import Linear_deprecated as Linear +from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d +from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, + sigmoid_focal_loss, softmax_focal_loss) +from .furthest_point_sample import (furthest_point_sample, + furthest_point_sample_with_dist) +from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu +from .gather_points import gather_points +from .group_points import GroupAll, QueryAndGroup, grouping_operation +from .info import (get_compiler_version, get_compiling_cuda_version, + get_onnxruntime_op_path) +from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev +from .knn import knn +from .masked_conv import MaskedConv2d, masked_conv2d +from .modulated_deform_conv import (ModulatedDeformConv2d, + ModulatedDeformConv2dPack, + modulated_deform_conv2d) +from .multi_scale_deform_attn import MultiScaleDeformableAttention +from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms +from .pixel_group import pixel_group +from .point_sample import (SimpleRoIAlign, point_sample, + rel_roi_point_to_rel_img_point) +from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, + 
points_in_boxes_part) +from .points_sampler import PointsSampler +from .psa_mask import PSAMask +from .roi_align import RoIAlign, roi_align +from .roi_align_rotated import RoIAlignRotated, roi_align_rotated +from .roi_pool import RoIPool, roi_pool +from .roiaware_pool3d import RoIAwarePool3d +from .roipoint_pool3d import RoIPointPool3d +from .saconv import SAConv2d +from .scatter_points import DynamicScatter, dynamic_scatter +from .sync_bn import SyncBatchNorm +from .three_interpolate import three_interpolate +from .three_nn import three_nn +from .tin_shift import TINShift, tin_shift +from .upfirdn2d import upfirdn2d +from .voxelize import Voxelization, voxelization + +__all__ = [ + 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', + 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', + 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', + 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', + 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', + 'get_compiler_version', 'get_compiling_cuda_version', + 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d', + 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', + 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', + 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', + 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', + 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', + 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', + 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', + 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', + 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', + 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', + 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', + 'border_align', 'gather_points', 'furthest_point_sample', + 'furthest_point_sample_with_dist', 
class AssignScoreWithK(Function):
    r"""Perform a weighted sum of neighbor features according to scores.

    Modified from PAConv. Per the original notes, this is a memory-efficient
    CUDA implementation of the assign_scores operation: all point features
    are first transformed with the weight bank, then neighbor features are
    assembled with ``knn_idx`` and combined by a weighted sum of ``scores``.
    See appendix Sec. D of the PAConv paper for a detailed description.

    The actual computation is delegated to the compiled
    ``assign_score_withk_forward`` / ``assign_score_withk_backward`` kernels
    exposed through ``ext_module``.

    Note:
        This implementation assumes using ``neighbor`` kernel input, which is
        (point_features - center_features, point_features).
        See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
        pointnet2/paconv.py#L128 for more details.
    """

    # Integer code passed to the extension for each aggregation method.
    _AGGREGATE_CODES = {'sum': 0, 'avg': 1, 'max': 2}

    @staticmethod
    def forward(ctx,
                scores,
                point_features,
                center_features,
                knn_idx,
                aggregate='sum'):
        """Aggregate features with the forward kernel.

        Args:
            scores (torch.Tensor): (B, npoint, K, M), predicted scores to
                aggregate weight matrices in the weight bank.
                ``npoint`` is the number of sampled centers.
                ``K`` is the number of queried neighbors.
                ``M`` is the number of weight matrices in the weight bank.
            point_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed point features to be aggregated.
            center_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed center features to be aggregated.
            knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
                We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.

        Returns:
            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.

        Raises:
            KeyError: If ``aggregate`` is not one of the supported methods.
        """
        agg_code = AssignScoreWithK._AGGREGATE_CODES[aggregate]

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        # The kernel writes its result into this pre-allocated buffer.
        output = point_features.new_zeros((B, out_dim, npoint, K))
        ext_module.assign_score_withk_forward(
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            output,
            B=B,
            N0=N,
            N1=npoint,
            M=M,
            K=K,
            O=out_dim,
            aggregate=agg_code)

        # Only the inputs are read by ``backward``; ``output`` is deliberately
        # not saved so that autograd does not keep it alive needlessly.
        ctx.save_for_backward(point_features, center_features, scores,
                              knn_idx)
        ctx.agg = agg_code

        return output

    @staticmethod
    def backward(ctx, grad_out):
        """Compute input gradients with the backward kernel.

        Args:
            grad_out (torch.Tensor): (B, out_dim, npoint, K)

        Returns:
            tuple: ``(grad_scores, grad_point_features,
            grad_center_features, None, None)`` where

            - grad_scores (torch.Tensor): (B, npoint, K, M)
            - grad_point_features (torch.Tensor): (B, N, M, out_dim)
            - grad_center_features (torch.Tensor): (B, N, M, out_dim)

            ``knn_idx`` and ``aggregate`` receive no gradient.
        """
        point_features, center_features, scores, knn_idx = ctx.saved_tensors

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        # Gradient buffers are filled in place by the kernel.
        grad_point_features = point_features.new_zeros(point_features.shape)
        grad_center_features = center_features.new_zeros(
            center_features.shape)
        grad_scores = scores.new_zeros(scores.shape)

        ext_module.assign_score_withk_backward(
            grad_out.contiguous(),
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            grad_point_features,
            grad_center_features,
            grad_scores,
            B=B,
            N0=N,
            N1=npoint,
            M=M,
            K=K,
            O=out_dim,
            aggregate=ctx.agg)

        # Order matches the positional inputs of ``forward``.
        return (grad_scores, grad_point_features, grad_center_features, None,
                None)
def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
    """Calculate overlap between two sets of bboxes.

    If ``aligned`` is ``False``, then calculate the ious between each bbox
    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (m, 4), or empty. The box format tag was lost
            from the original docstring; presumably (x1, y1, x2, y2) --
            confirm against the compiled extension.
        bboxes2 (Tensor): shape (n, 4), or empty (same format as bboxes1).
            If aligned is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).
        aligned (bool): Whether to compare boxes pair-wise instead of
            all-against-all. Defaults to False.
        offset (int): Must be 0 or 1; forwarded unchanged to the extension.
            Defaults to 0.

    Returns:
        Tensor: shape (m, n) if ``aligned`` is False. If ``aligned`` is True
        the result has shape (m,), except when there are no boxes, in which
        case an empty tensor of shape (0, 1) is returned.

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> bbox_overlaps(bboxes1, bboxes2)
        tensor([[0.5000, 0.0000, 0.0000],
                [0.0000, 0.0000, 1.0000],
                [0.0000, 0.0000, 0.0000]])

    Example:
        >>> empty = torch.FloatTensor([])
        >>> nonempty = torch.FloatTensor([
        >>>     [0, 0, 10, 9],
        >>> ])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    mode_dict = {'iou': 0, 'iof': 1}
    assert mode in mode_dict, 'mode must be "iou" or "iof"'
    mode_flag = mode_dict[mode]
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
    assert offset == 1 or offset == 0, 'offset must be 0 or 1'

    rows = bboxes1.size(0)
    cols = bboxes2.size(0)
    if aligned:
        assert rows == cols, 'aligned mode needs the same number of boxes'

    if rows * cols == 0:
        # Nothing to compare: return an empty tensor without touching the
        # extension. The tensor holds no elements, so no initialisation is
        # required.
        return bboxes1.new_empty((rows, 1) if aligned else (rows, cols))

    # The extension fills this pre-allocated buffer in place.
    ious = bboxes1.new_zeros(rows if aligned else (rows, cols))
    ext_module.bbox_overlaps(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
    return ious
class BorderAlignFunction(Function):
    """Autograd function wrapping the compiled border-align kernels.

    ``forward`` and ``backward`` delegate to ``border_align_forward`` and
    ``border_align_backward`` from ``ext_module``; ``symbolic`` emits the
    ``mmcv::MMCVBorderAlign`` op for ONNX export.
    """

    @staticmethod
    def symbolic(g, input, boxes, pool_size):
        return g.op(
            'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)

    @staticmethod
    def forward(ctx, input, boxes, pool_size):
        """Pool border features.

        Args:
            input (Tensor): Feature map whose channel count (dim 1) is
                divisible by 4.
            boxes (Tensor): Boxes with shape [B, H*W, 4] in
                (x1, y1, x2, y2) format.
            pool_size (int): Passed to the kernel as ``pool_size``.

        Returns:
            Tensor: Output of shape [B, C//4, H*W, 4].
        """
        ctx.pool_size = pool_size
        ctx.input_shape = input.size()

        assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]'
        assert boxes.size(2) == 4, \
            'the last dimension of boxes must be (x1, y1, x2, y2)'
        assert input.size(1) % 4 == 0, \
            'the channel for input feature must be divisible by factor 4'

        # [B, C//4, H*W, 4]
        output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
        output = input.new_zeros(output_shape)
        # `argmax_idx` is only used for backward. Allocate it as int directly
        # rather than creating a buffer in the input dtype and converting it.
        argmax_idx = input.new_zeros(output_shape, dtype=torch.int)

        ext_module.border_align_forward(
            input, boxes, output, argmax_idx, pool_size=ctx.pool_size)

        ctx.save_for_backward(boxes, argmax_idx)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input``; other inputs get ``None``."""
        boxes, argmax_idx = ctx.saved_tensors
        grad_input = grad_output.new_zeros(ctx.input_shape)
        # A complex head architecture may produce a non-contiguous
        # grad_output, so make it contiguous before calling the kernel.
        grad_output = grad_output.contiguous()
        ext_module.border_align_backward(
            grad_output,
            boxes,
            argmax_idx,
            grad_input,
            pool_size=ctx.pool_size)
        return grad_input, None, None
def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
    """Return intersection-over-union (Jaccard index) of rotated boxes.

    Both sets of boxes are expected to be in
    (x_center, y_center, width, height, angle) format.

    With ``aligned=False`` every box of ``bboxes1`` is compared against every
    box of ``bboxes2``; with ``aligned=True`` only the aligned pairs are
    compared.

    Arguments:
        bboxes1 (Tensor): rotated bboxes 1.
            It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
            Note that theta is in radian.
        bboxes2 (Tensor): rotated bboxes 2.
            It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
            Note that theta is in radian.
        mode (str): "iou" (intersection over union) or iof (intersection over
            foreground).
        aligned (bool): Compare aligned pairs only. Defaults to False.

    Returns:
        ious(Tensor): shape (N, M) if aligned == False else shape (N,)
    """
    assert mode in ['iou', 'iof']
    mode_flag = {'iou': 0, 'iof': 1}[mode]

    num_rows, num_cols = bboxes1.size(0), bboxes2.size(0)
    # The kernel writes into a flat buffer; it is reshaped afterwards for the
    # all-against-all case.
    flat_len = num_rows if aligned else num_rows * num_cols
    ious = bboxes1.new_zeros(flat_len)

    ext_module.box_iou_rotated(
        bboxes1.contiguous(),
        bboxes2.contiguous(),
        ious,
        mode_flag=mode_flag,
        aligned=aligned)

    return ious if aligned else ious.view(num_rows, num_cols)
class CARAFENaiveFunction(Function):
    """Autograd function around the ``carafe_naive_*`` extension kernels."""

    @staticmethod
    def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
        op_attrs = dict(
            kernel_size_i=kernel_size,
            group_size_i=group_size,
            scale_factor_f=scale_factor)
        return g.op('mmcv::MMCVCARAFENaive', features, masks, **op_attrs)

    @staticmethod
    def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
        """Upsample ``features`` by ``scale_factor`` using ``masks``.

        The asserts below pin the expected relationship between the mask and
        feature shapes; the result has the spatial size of ``features``
        multiplied by ``scale_factor``.
        """
        assert scale_factor >= 1
        assert masks.size(1) == kernel_size * kernel_size * group_size
        assert masks.size(-1) == features.size(-1) * scale_factor
        assert masks.size(-2) == features.size(-2) * scale_factor
        assert features.size(1) % group_size == 0
        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1

        # Stash the hyper-parameters and shapes on the context.
        ctx.kernel_size = kernel_size
        ctx.group_size = group_size
        ctx.scale_factor = scale_factor
        ctx.feature_size = features.size()
        ctx.mask_size = masks.size()

        batch, channels, height, width = features.size()
        upsampled_shape = (batch, channels, height * scale_factor,
                           width * scale_factor)
        # Filled in place by the kernel.
        upsampled = features.new_zeros(upsampled_shape)
        ext_module.carafe_naive_forward(
            features,
            masks,
            upsampled,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        needs_grad = features.requires_grad or masks.requires_grad
        if needs_grad:
            ctx.save_for_backward(features, masks)
        return upsampled

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``features`` and ``masks`` (CUDA only)."""
        assert grad_output.is_cuda

        features, masks = ctx.saved_tensors

        # Gradient buffers written by the kernel.
        grad_input = torch.zeros_like(features)
        grad_masks = torch.zeros_like(masks)
        ext_module.carafe_naive_backward(
            grad_output.contiguous(),
            features,
            masks,
            grad_input,
            grad_masks,
            kernel_size=ctx.kernel_size,
            group_size=ctx.group_size,
            scale_factor=ctx.scale_factor)

        # The three hyper-parameters are not differentiable.
        return grad_input, grad_masks, None, None, None
class CARAFE(Module):
    """CARAFE: Content-Aware ReAssembly of FEatures.

    Module wrapper that stores the operator's hyper-parameters and applies
    the ``carafe`` function in ``forward``.
    Please refer to https://arxiv.org/abs/1905.02188 for more details.

    Args:
        kernel_size (int): reassemble kernel size
        group_size (int): reassemble group size
        scale_factor (int): upsample ratio

    Returns:
        upsampled feature map
    """

    def __init__(self, kernel_size, group_size, scale_factor):
        super().__init__()

        # All three hyper-parameters must be plain integers.
        hyper_params = (kernel_size, group_size, scale_factor)
        assert all(isinstance(value, int) for value in hyper_params)

        self.kernel_size = kernel_size
        self.group_size = group_size
        self.scale_factor = scale_factor

    def forward(self, features, masks):
        return carafe(
            features,
            masks,
            self.kernel_size,
            self.group_size,
            self.scale_factor,
        )
def NEG_INF_DIAG(n, device):
    """Return an [n, n] matrix with "-inf" on the diagonal and 0 elsewhere.

    This is for avoiding calculating the overlapped element in the
    Criss-Cross twice.

    Args:
        n (int): Size of the square matrix.
        device (torch.device | str): Device on which the matrix is created.

    Returns:
        Tensor: Matrix of shape [n, n] in the default floating dtype.
    """
    # Create the diagonal values directly on the target device instead of
    # building a CPU scalar and copying it over on every call.
    diagonal = torch.full((n, ), float('-inf'), device=device)
    return torch.diag(diagonal)
def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
                   kernel_num):
    """Expand kernel contours so that foreground pixels are assigned into
    instances.

    NumPy inputs are converted to tensors before the compiled
    ``contour_expand`` op is invoked.

    Arguments:
        kernel_mask (np.array or Tensor): The instance kernel mask with
            size hxw.
        internal_kernel_label (np.array or Tensor): The instance internal
            kernel label with size hxw.
        min_kernel_area (int): The minimum kernel area.
        kernel_num (int): The instance kernel number.

    Returns:
        label (list): The instance index map with size hxw. Under parrots the
        op result is converted with ``tolist()`` (and an empty input yields
        ``[]``); otherwise the op result is returned as is.
    """
    array_types = (torch.Tensor, np.ndarray)
    assert isinstance(kernel_mask, array_types)
    assert isinstance(internal_kernel_label, array_types)
    assert isinstance(min_kernel_area, int)
    assert isinstance(kernel_num, int)

    # The extension works on tensors only.
    if isinstance(kernel_mask, np.ndarray):
        kernel_mask = torch.from_numpy(kernel_mask)
    if isinstance(internal_kernel_label, np.ndarray):
        internal_kernel_label = torch.from_numpy(internal_kernel_label)

    if torch.__version__ != 'parrots':
        # Regular PyTorch build: positional call, result returned directly.
        return ext_module.contour_expand(kernel_mask, internal_kernel_label,
                                         min_kernel_area, kernel_num)

    # parrots build: empty inputs short-circuit to an empty list.
    if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
        return []
    expanded = ext_module.contour_expand(
        kernel_mask,
        internal_kernel_label,
        min_kernel_area=min_kernel_area,
        kernel_num=kernel_num)
    return expanded.tolist()
class TopPoolFunction(Function):
    """Autograd wrapper around the ``top_pool`` ops of the ``_ext`` module."""

    @staticmethod
    def symbolic(g, input):
        # ONNX export: emit the custom corner-pool node, tagged with the
        # integer code stored for 'top' in ``_mode_dict``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.top_pool_forward(input)
        # The backward op takes the original input, so keep it on ctx.
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.top_pool_backward(input, grad_output)
        return output


class BottomPoolFunction(Function):
    """Autograd wrapper around the ``bottom_pool`` ops of ``_ext``."""

    @staticmethod
    def symbolic(g, input):
        # ONNX export: custom corner-pool node with the 'bottom' mode code.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.bottom_pool_forward(input)
        # The backward op takes the original input, so keep it on ctx.
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.bottom_pool_backward(input, grad_output)
        return output


class LeftPoolFunction(Function):
    """Autograd wrapper around the ``left_pool`` ops of ``_ext``."""

    @staticmethod
    def symbolic(g, input):
        # ONNX export: custom corner-pool node with the 'left' mode code.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.left_pool_forward(input)
        # The backward op takes the original input, so keep it on ctx.
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.left_pool_backward(input, grad_output)
        return output


class RightPoolFunction(Function):
    """Autograd wrapper around the ``right_pool`` ops of ``_ext``."""

    @staticmethod
    def symbolic(g, input):
        # ONNX export: custom corner-pool node with the 'right' mode code.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.right_pool_forward(input)
        # The backward op takes the original input, so keep it on ctx.
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.right_pool_backward(input, grad_output)
        return output


class CornerPool(nn.Module):
    """Corner Pooling.

    Corner Pooling is a new type of pooling layer that helps a
    convolutional network better localize corners of bounding boxes.

    Please refer to https://arxiv.org/abs/1808.01244 for more details.
    Code is modified from https://github.com/princeton-vl/CornerNet-Lite.

    On PyTorch >= 1.5.0 the pooling is computed with ``torch.cummax`` (plus
    a flip for the 'left' and 'top' modes); on older PyTorch and on parrots
    the matching ``*PoolFunction`` extension op is used instead.

    Args:
        mode(str): Pooling orientation for the pooling layer

            - 'bottom': Bottom Pooling
            - 'left': Left Pooling
            - 'right': Right Pooling
            - 'top': Top Pooling

    Returns:
        Feature map after pooling.
    """

    pool_functions = {
        'bottom': BottomPoolFunction,
        'left': LeftPoolFunction,
        'right': RightPoolFunction,
        'top': TopPoolFunction,
    }

    # mode -> (dimension to run cummax over, whether to flip that dimension
    # before and after the cummax)
    cummax_dim_flip = {
        'bottom': (2, False),
        'left': (3, True),
        'right': (3, False),
        'top': (2, True),
    }

    def __init__(self, mode):
        super(CornerPool, self).__init__()
        assert mode in self.pool_functions
        self.mode = mode
        self.corner_pool = self.pool_functions[mode]

    @staticmethod
    def _version_tuple(version, length=3):
        """Return the leading numeric release fields of ``version``.

        Version strings must be compared numerically: as plain strings,
        ``'1.10.0' >= '1.5.0'`` is False. Local build tags (``+cu118``) and
        pre-release suffixes (``0a0``) are ignored; missing or non-numeric
        fields count as 0.

        Args:
            version (str): Version string such as ``'1.13.0a0+git1234'``.
            length (int): Number of release fields to keep. Defaults to 3.

        Returns:
            tuple[int]: ``length`` integers, e.g. ``(1, 13, 0)``.
        """
        release = str(version).split('+', 1)[0]
        fields = []
        for piece in release.split('.')[:length]:
            digits = ''
            for char in piece:
                if not char.isdigit():
                    break
                digits += char
            fields.append(int(digits) if digits else 0)
        fields.extend([0] * (length - len(fields)))
        return tuple(fields)

    def forward(self, x):
        version = torch.__version__
        if version != 'parrots' and \
                self._version_tuple(version) >= (1, 5, 0):
            if torch.onnx.is_in_onnx_export():
                assert self._version_tuple(version) >= (1, 7, 0), \
                    'When `cummax` serves as an intermediate component whose '\
                    'outputs is used as inputs for another modules, it\'s '\
                    'expected that pytorch version must be >= 1.7.0, '\
                    'otherwise Error appears like: `RuntimeError: tuple '\
                    'appears in op that does not forward tuples, unsupported '\
                    'kind: prim::PythonOp`.'

            dim, flip = self.cummax_dim_flip[self.mode]
            # cummax accumulates towards increasing indices; 'left' and
            # 'top' need the opposite direction, hence the flip round trip.
            if flip:
                x = x.flip(dim)
            pool_tensor, _ = torch.cummax(x, dim=dim)
            if flip:
                pool_tensor = pool_tensor.flip(dim)
            return pool_tensor
        else:
            return self.corner_pool.apply(x)
class Correlation(nn.Module):
    r"""Correlation operator.

    Module front-end for ``CorrelationFunction``, intended for optical-flow
    correlation computation.

    Both inputs are batched tensors of shape :math:`(N, C, H, W)`. The
    output has shape :math:`(N, max\_displacement \times 2 + 1,
    max\_displacement \times 2 + 1, H_{out}, W_{out})` with

    .. math::
        H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding -
            dilation \times (kernel\_size - 1) - 1}
            {stride} + 1\right\rfloor

    .. math::
        W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation
            \times (kernel\_size - 1) - 1}
            {stride} + 1\right\rfloor

    The correlation item :math:`(N_i, dy, dx)` is the sliding-window
    convolution between input1 and a shifted copy of input2,

    .. math::
        Corr(N_i, dy, dx) =
            \sum_{c=0}^{C-1}
            input1(N_i, c) \star
            \mathcal{S}(input2(N_i, c), dy, dx)

    where :math:`\star` is the valid 2d sliding window convolution operator,
    :math:`\mathcal{S}` shifts the input features (filling the margin with
    zeros), and the shifting distances satisfy :math:`dx, dy \in
    [-max\_displacement \times dilation\_patch, max\_displacement \times
    dilation\_patch]`.

    Args:
        kernel_size (int): The size of sliding window i.e. local neighborhood
            representing the center points and involved in correlation
            computation. Defaults to 1.
        max_displacement (int): The radius for computing correlation volume,
            but the actual working space can be dilated by dilation_patch.
            Defaults to 1.
        stride (int): The stride of the sliding blocks in the input spatial
            dimensions. Defaults to 1.
        padding (int): Zero padding added to all four sides of the input1.
            Defaults to 0.
        dilation (int): The spacing of local neighborhood that will involved
            in correlation. Defaults to 1.
        dilation_patch (int): The spacing between position need to compute
            correlation. Defaults to 1.
    """

    def __init__(self,
                 kernel_size: int = 1,
                 max_displacement: int = 1,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 dilation_patch: int = 1) -> None:
        super().__init__()
        # Settings are stored verbatim and handed to CorrelationFunction on
        # every forward call.
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.dilation_patch = dilation_patch

    def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
        """Correlate ``input1`` with ``input2`` using the stored settings."""
        settings = (self.kernel_size, self.max_displacement, self.stride,
                    self.padding, self.dilation, self.dilation_patch)
        return CorrelationFunction.apply(input1, input2, *settings)

    def __repr__(self) -> str:
        field_names = ('kernel_size', 'max_displacement', 'stride',
                       'padding', 'dilation', 'dilation_patch')
        described = ', '.join(
            f'{name}={getattr(self, name)}' for name in field_names)
        return f'{self.__class__.__name__}({described})'
class DeformConv2dFunction(Function):
    """Autograd function for deformable 2D convolution.

    The computation is delegated to the compiled ``_ext`` ops
    ``deform_conv_forward``, ``deform_conv_backward_input`` and
    ``deform_conv_backward_parameters``.
    """

    @staticmethod
    def symbolic(g,
                 input,
                 offset,
                 weight,
                 stride,
                 padding,
                 dilation,
                 groups,
                 deform_groups,
                 bias=False,
                 im2col_step=32):
        """Emit the ``mmcv::MMCVDeformConv2d`` node for ONNX export."""
        return g.op(
            'mmcv::MMCVDeformConv2d',
            input,
            offset,
            weight,
            stride_i=stride,
            padding_i=padding,
            dilation_i=dilation,
            groups_i=groups,
            deform_groups_i=deform_groups,
            bias_i=bias,
            im2col_step_i=im2col_step)

    @staticmethod
    def forward(ctx,
                input,
                offset,
                weight,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                deform_groups=1,
                bias=False,
                im2col_step=32):
        """Run the deformable convolution.

        Args:
            ctx: Autograd context; receives the hyper-parameters, the saved
                tensors and the two scratch buffers used by the ext ops.
            input (Tensor): 4D input feature map.
            offset (Tensor): Sampling offsets. Its dtype decides the dtype
                ``input`` and ``weight`` are cast to.
            weight (Tensor): Convolution kernel.
            stride (int or tuple): Convolution stride. Defaults to 1.
            padding (int or tuple): Zero padding. Defaults to 0.
            dilation (int or tuple): Kernel dilation. Defaults to 1.
            groups (int): Passed to the ext op as ``group``. Defaults to 1.
            deform_groups (int): Passed to the ext op as
                ``deformable_group``. Defaults to 1.
            bias (bool): Must be False. Defaults to False.
            im2col_step (int): Upper bound on the per-call step; the step
                actually used is ``min(im2col_step, batch_size)`` and must
                divide the batch size. Defaults to 32.

        Returns:
            Tensor: Output of shape given by :meth:`_output_size`.

        Raises:
            ValueError: If ``input`` is not 4D, or the computed output size
                is not positive.
        """
        if input is not None and input.dim() != 4:
            # Adjacent literals instead of a backslash continuation inside
            # the string, so no source indentation ends up in the message.
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        assert bias is False, 'Only support bias is False.'
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deform_groups = deform_groups
        ctx.im2col_step = im2col_step

        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of model (float32), but "offset" is cast
        # to float16 by nn.Conv2d automatically, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "offset",
        # we cast weight and input to temporarily support fp16 and amp
        # whatever the pytorch version is.
        input = input.type_as(offset)
        weight = weight.type_as(input)
        ctx.save_for_backward(input, offset, weight)

        output = input.new_empty(
            DeformConv2dFunction._output_size(ctx, input, weight))

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) %
                cur_im2col_step) == 0, 'im2col step must divide batchsize'
        ext_module.deform_conv_forward(
            input,
            weight,
            offset,
            output,
            ctx.bufs_[0],
            ctx.bufs_[1],
            kW=weight.size(3),
            kH=weight.size(2),
            dW=ctx.stride[1],
            dH=ctx.stride[0],
            padW=ctx.padding[1],
            padH=ctx.padding[0],
            dilationW=ctx.dilation[1],
            dilationH=ctx.dilation[0],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            im2col_step=cur_im2col_step)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Compute gradients for ``input``, ``offset`` and ``weight``.

        Returns one entry per ``forward`` argument; the seven
        hyper-parameters get ``None``.
        """
        input, offset, weight = ctx.saved_tensors

        grad_input = grad_offset = grad_weight = None

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) % cur_im2col_step
                ) == 0, 'batch size must be divisible by im2col_step'

        grad_output = grad_output.contiguous()
        # A single ext call fills both the input and the offset gradient,
        # so it runs when either one is requested.
        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
            grad_input = torch.zeros_like(input)
            grad_offset = torch.zeros_like(offset)
            ext_module.deform_conv_backward_input(
                input,
                offset,
                grad_output,
                grad_input,
                grad_offset,
                weight,
                ctx.bufs_[0],
                kW=weight.size(3),
                kH=weight.size(2),
                dW=ctx.stride[1],
                dH=ctx.stride[0],
                padW=ctx.padding[1],
                padH=ctx.padding[0],
                dilationW=ctx.dilation[1],
                dilationH=ctx.dilation[0],
                group=ctx.groups,
                deformable_group=ctx.deform_groups,
                im2col_step=cur_im2col_step)

        if ctx.needs_input_grad[2]:
            grad_weight = torch.zeros_like(weight)
            ext_module.deform_conv_backward_parameters(
                input,
                offset,
                grad_output,
                grad_weight,
                ctx.bufs_[0],
                ctx.bufs_[1],
                kW=weight.size(3),
                kH=weight.size(2),
                dW=ctx.stride[1],
                dH=ctx.stride[0],
                padW=ctx.padding[1],
                padH=ctx.padding[0],
                dilationW=ctx.dilation[1],
                dilationH=ctx.dilation[0],
                group=ctx.groups,
                deformable_group=ctx.deform_groups,
                scale=1,
                im2col_step=cur_im2col_step)

        return grad_input, grad_offset, grad_weight, \
            None, None, None, None, None, None, None

    @staticmethod
    def _output_size(ctx, input, weight):
        """Return the output shape ``(batch, out_channels, *spatial)``.

        Each spatial size is
        ``(in + 2 * pad - (dilation * (k - 1) + 1)) // stride + 1``.

        Raises:
            ValueError: If any entry of the resulting shape is not positive.
        """
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if any(size <= 0 for size in output_size):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size
+ deform_groups (int): Number of deformable group partitions. + bias (bool): If True, adds a learnable bias to the output. + Default: False. + im2col_step (int): Number of samples processed by im2col_cuda_kernel + per call. It will work when ``batch_size`` > ``im2col_step``, but + ``batch_size`` must be divisible by ``im2col_step``. Default: 32. + `New in version 1.3.17.` + """ + + @deprecated_api_warning({'deformable_groups': 'deform_groups'}, + cls_name='DeformConv2d') + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, ...]], + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1, + deform_groups: int = 1, + bias: bool = False, + im2col_step: int = 32) -> None: + super(DeformConv2d, self).__init__() + + assert not bias, \ + f'bias={bias} is not supported in DeformConv2d.' + assert in_channels % groups == 0, \ + f'in_channels {in_channels} cannot be divisible by groups {groups}' + assert out_channels % groups == 0, \ + f'out_channels {out_channels} cannot be divisible by groups \ + {groups}' + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deform_groups = deform_groups + self.im2col_step = im2col_step + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + # only weight, no bias + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + # switch the initialization of `self.weight` to the standard kaiming + # method described in `Delving deep into rectifiers: Surpassing + # human-level performance on ImageNet classification` - He, K. et al. 
+ # (2015), using a uniform distribution + nn.init.kaiming_uniform_(self.weight, nonlinearity='relu') + + def forward(self, x: Tensor, offset: Tensor) -> Tensor: + """Deformable Convolutional forward function. + + Args: + x (Tensor): Input feature, shape (B, C_in, H_in, W_in) + offset (Tensor): Offset for deformable convolution, shape + (B, deform_groups*kernel_size[0]*kernel_size[1]*2, + H_out, W_out), H_out, W_out are equal to the output's. + + An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Returns: + Tensor: Output of the layer. + """ + # To fix an assert error in deform_conv_cuda.cpp:128 + # input image is smaller than kernel + input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) < + self.kernel_size[1]) + if input_pad: + pad_h = max(self.kernel_size[0] - x.size(2), 0) + pad_w = max(self.kernel_size[1] - x.size(3), 0) + x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() + offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0) + offset = offset.contiguous() + out = deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + if input_pad: + out = out[:, :, :out.size(2) - pad_h, :out.size(3) - + pad_w].contiguous() + return out + + def __repr__(self): + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels},\n' + s += f'out_channels={self.out_channels},\n' + s += f'kernel_size={self.kernel_size},\n' + s += f'stride={self.stride},\n' + s += f'padding={self.padding},\n' + s += f'dilation={self.dilation},\n' + s += f'groups={self.groups},\n' + s += f'deform_groups={self.deform_groups},\n' + # bias is not supported in DeformConv2d. 
@CONV_LAYERS.register_module('DCN')
class DeformConv2dPack(DeformConv2d):
    """A Deformable Conv Encapsulation that acts as normal Conv layers.

    Unlike ``DeformConv2d``, the offsets are not passed in by the caller:
    they are predicted from the input by an internal ``conv_offset``
    convolution whose weight and bias start at zero.

    The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
    The spatial arrangement is like:

    .. code:: text

        (x0, y0) (x1, y1) (x2, y2)
        (x3, y3) (x4, y4) (x5, y5)
        (x6, y6) (x7, y7) (x8, y8)

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int or tuple[int]): Same as nn.Conv2d.
        padding (int or tuple[int]): Same as nn.Conv2d.
        dilation (int or tuple[int]): Same as nn.Conv2d.
        groups (int): Same as nn.Conv2d.
        bias (bool): Forwarded to ``DeformConv2d.__init__``, which asserts
            that it is falsy.
    """

    _version = 2

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        kernel_h, kernel_w = self.kernel_size
        # Two offset channels (y, x) per kernel position per deform group.
        offset_channels = self.deform_groups * 2 * kernel_h * kernel_w
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            offset_channels,
            kernel_size=self.kernel_size,
            stride=_pair(self.stride),
            padding=_pair(self.padding),
            dilation=_pair(self.dilation),
            bias=True)
        self.init_offset()

    def init_offset(self):
        """Zero the offset predictor's weight and bias."""
        for param in (self.conv_offset.weight, self.conv_offset.bias):
            param.data.zero_()

    def forward(self, x):
        """Predict offsets from ``x`` and apply the deformable convolution."""
        predicted_offset = self.conv_offset(x)
        return deform_conv2d(x, predicted_offset, self.weight, self.stride,
                             self.padding, self.dilation, self.groups,
                             self.deform_groups, False, self.im2col_step)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Load parameters, renaming offset keys of pre-version-2 checkpoints.

        Checkpoints without a version (or with version < 2) may store the
        offset conv as ``<name>_offset.*`` instead of
        ``<name>.conv_offset.*``; such keys are moved in ``state_dict``
        before the regular loading runs.
        """
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, DeformConvPack loads previous benchmark models.
            for suffix in ('weight', 'bias'):
                current_key = prefix + 'conv_offset.' + suffix
                legacy_key = prefix[:-1] + '_offset.' + suffix
                if current_key not in state_dict and legacy_key in state_dict:
                    state_dict[current_key] = state_dict.pop(legacy_key)

        if version is not None and version > 1:
            print_log(
                f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to '
                'version 2.',
                logger='root')

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)
class DeformRoIPoolFunction(Function):
    """Autograd function for deformable RoI pooling.

    Forward and backward are delegated to the compiled ``_ext`` ops
    ``deform_roi_pool_forward`` and ``deform_roi_pool_backward``.
    """

    @staticmethod
    def symbolic(g, input, rois, offset, output_size, spatial_scale,
                 sampling_ratio, gamma):
        """Emit the ``mmcv::MMCVDeformRoIPool`` node for ONNX export."""
        pooled_height, pooled_width = output_size[0], output_size[1]
        return g.op(
            'mmcv::MMCVDeformRoIPool',
            input,
            rois,
            offset,
            pooled_height_i=pooled_height,
            pooled_width_i=pooled_width,
            spatial_scale_f=spatial_scale,
            sampling_ratio_f=sampling_ratio,
            gamma_f=gamma)

    @staticmethod
    def forward(ctx,
                input,
                rois,
                offset,
                output_size,
                spatial_scale=1.0,
                sampling_ratio=0,
                gamma=0.1):
        """Pool ``input`` over ``rois``.

        Args:
            ctx: Autograd context; receives the normalised hyper-parameters
                and the tensors saved for backward.
            input (Tensor): Feature map; ``input.size(1)`` becomes the
                channel dimension of the output.
            rois (Tensor): RoIs with 5 columns, (idx, x1, y1, x2, y2).
            offset (Tensor or None): Sampling offsets. ``None`` is replaced
                by an empty tensor.
            output_size (int or tuple): Pooled (height, width).
            spatial_scale (float): Passed to the ext op. Defaults to 1.0.
            sampling_ratio (int): Passed to the ext op. Defaults to 0.
            gamma (float): Passed to the ext op. Defaults to 0.1.

        Returns:
            Tensor: Shape (num_rois, channels, pooled_h, pooled_w).
        """
        if offset is None:
            # An empty tensor stands in for "no offset".
            offset = input.new_zeros(0)
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = float(spatial_scale)
        ctx.sampling_ratio = int(sampling_ratio)
        ctx.gamma = float(gamma)

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        pooled_h, pooled_w = ctx.output_size[0], ctx.output_size[1]
        pooled = input.new_zeros(
            (rois.size(0), input.size(1), pooled_h, pooled_w))

        ext_module.deform_roi_pool_forward(
            input,
            rois,
            offset,
            pooled,
            pooled_height=pooled_h,
            pooled_width=pooled_w,
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            gamma=ctx.gamma)

        ctx.save_for_backward(input, rois, offset)
        return pooled

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return gradients for ``input`` and ``offset``; others get None."""
        input, rois, offset = ctx.saved_tensors
        d_input = grad_output.new_zeros(input.shape)
        d_offset = grad_output.new_zeros(offset.shape)

        ext_module.deform_roi_pool_backward(
            grad_output,
            input,
            rois,
            offset,
            d_input,
            d_offset,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            gamma=ctx.gamma)
        # An empty offset gradient means forward ran with the empty
        # placeholder offset, so report "no gradient".
        offset_grad = d_offset if d_offset.numel() != 0 else None
        return d_input, None, offset_grad, None, None, None, None
class ModulatedDeformRoIPoolPack(DeformRoIPool):
    """Deformable RoI pooling with self-predicted offsets and a mask.

    A first pooling pass without offsets produces features from which two
    small fully connected heads predict (a) the offsets for a second
    pooling pass and (b) a sigmoid mask that scales the final result.

    Args:
        output_size (int or tuple): Pooled (height, width).
        output_channels (int): Expected channel count of the input feature
            map (asserted in ``forward``).
        deform_fc_channels (int): Hidden width of both heads.
            Defaults to 1024.
        spatial_scale (float): Forwarded to ``DeformRoIPool``.
            Defaults to 1.0.
        sampling_ratio (int): Forwarded to ``DeformRoIPool``. Defaults to 0.
        gamma (float): Forwarded to ``DeformRoIPool``. Defaults to 0.1.
    """

    def __init__(self,
                 output_size,
                 output_channels,
                 deform_fc_channels=1024,
                 spatial_scale=1.0,
                 sampling_ratio=0,
                 gamma=0.1):
        super().__init__(output_size, spatial_scale, sampling_ratio, gamma)

        self.output_channels = output_channels
        self.deform_fc_channels = deform_fc_channels

        num_bins = self.output_size[0] * self.output_size[1]
        flat_features = num_bins * self.output_channels
        hidden = self.deform_fc_channels

        # Offset head: two values per pooled bin; the last layer starts at
        # zero so the initial offsets are all zero.
        self.offset_fc = nn.Sequential(
            nn.Linear(flat_features, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, num_bins * 2))
        offset_out = self.offset_fc[-1]
        offset_out.weight.data.zero_()
        offset_out.bias.data.zero_()

        # Mask head: one sigmoid value per pooled bin; its last Linear
        # (index 2) also starts at zero.
        self.mask_fc = nn.Sequential(
            nn.Linear(flat_features, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, num_bins * 1),
            nn.Sigmoid())
        mask_out = self.mask_fc[2]
        mask_out.weight.data.zero_()
        mask_out.bias.data.zero_()

    def forward(self, input, rois):
        """Pool twice (plain, then with predicted offsets) and apply the mask.

        Args:
            input (Tensor): Feature map with ``output_channels`` channels.
            rois (Tensor): RoIs handed to ``deform_roi_pool``.

        Returns:
            Tensor: Offset-pooled features multiplied by the predicted mask.
        """
        assert input.size(1) == self.output_channels
        pooled_h, pooled_w = self.output_size[0], self.output_size[1]

        # Pass 1: no offsets; only used to feed the two prediction heads.
        plain = deform_roi_pool(input, rois, None, self.output_size,
                                self.spatial_scale, self.sampling_ratio,
                                self.gamma)
        num_rois = rois.size(0)
        flat = plain.view(num_rois, -1)

        offset = self.offset_fc(flat)
        offset = offset.view(num_rois, 2, pooled_h, pooled_w)
        mask = self.mask_fc(flat)
        mask = mask.view(num_rois, 1, pooled_h, pooled_w)

        # Pass 2: pool again with the predicted offsets.
        deformed = deform_roi_pool(input, rois, offset, self.output_size,
                                   self.spatial_scale, self.sampling_ratio,
                                   self.gamma)
        return deformed * mask
Please import them from "mmcv.cnn" ' + 'instead') + + +class MaxPool2d_deprecated(MaxPool2d): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in' + ' the future. Please import them from "mmcv.cnn" instead') + + +class Linear_deprecated(Linear): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing Linear wrapper from "mmcv.ops" will be deprecated in' + ' the future. Please import them from "mmcv.cnn" instead') diff --git a/lavis/common/annotator/uniformer/mmcv/ops/focal_loss.py b/lavis/common/annotator/uniformer/mmcv/ops/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..763bc93bd2575c49ca8ccf20996bbd92d1e0d1a4 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/ops/focal_loss.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward', + 'softmax_focal_loss_forward', 'softmax_focal_loss_backward' +]) + + +class SigmoidFocalLossFunction(Function): + + @staticmethod + def symbolic(g, input, target, gamma, alpha, weight, reduction): + return g.op( + 'mmcv::MMCVSigmoidFocalLoss', + input, + target, + gamma_f=gamma, + alpha_f=alpha, + weight_f=weight, + reduction_s=reduction) + + @staticmethod + def forward(ctx, + input, + target, + gamma=2.0, + alpha=0.25, + weight=None, + reduction='mean'): + + assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert input.dim() == 2 + assert target.dim() == 1 + assert input.size(0) == target.size(0) + if weight is None: + weight = input.new_empty(0) + else: + assert weight.dim() == 1 + assert input.size(1) == weight.size(0) + 
class SigmoidFocalLoss(nn.Module):
    """Module wrapper around the ``sigmoid_focal_loss`` function.

    The module only stores the loss hyper-parameters; ``forward`` passes
    them, together with the inputs, straight to ``sigmoid_focal_loss``.

    Args:
        gamma: Forwarded as the ``gamma`` argument of the loss.
        alpha: Forwarded as the ``alpha`` argument of the loss.
        weight (Tensor, optional): Forwarded as the ``weight`` argument.
            Stored as a module buffer (not a parameter). Defaults to None.
        reduction (str): Forwarded as the ``reduction`` argument.
            Defaults to 'mean'.
    """

    def __init__(self, gamma, alpha, weight=None, reduction='mean'):
        super().__init__()
        self.gamma, self.alpha = gamma, alpha
        self.reduction = reduction
        # register_buffer accepts None, in which case ``self.weight`` is None.
        self.register_buffer('weight', weight)

    def forward(self, input, target):
        """Return ``sigmoid_focal_loss`` of ``input`` against ``target``."""
        return sigmoid_focal_loss(input, target, self.gamma, self.alpha,
                                  self.weight, self.reduction)

    def __repr__(self):
        # ``weight`` is intentionally not part of the representation.
        settings = (f'gamma={self.gamma}, '
                    f'alpha={self.alpha}, '
                    f'reduction={self.reduction}')
        return f'{self.__class__.__name__}({settings})'
gamma=2.0, + alpha=0.25, + weight=None, + reduction='mean'): + + assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert input.dim() == 2 + assert target.dim() == 1 + assert input.size(0) == target.size(0) + if weight is None: + weight = input.new_empty(0) + else: + assert weight.dim() == 1 + assert input.size(1) == weight.size(0) + ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} + assert reduction in ctx.reduction_dict.keys() + + ctx.gamma = float(gamma) + ctx.alpha = float(alpha) + ctx.reduction = ctx.reduction_dict[reduction] + + channel_stats, _ = torch.max(input, dim=1) + input_softmax = input - channel_stats.unsqueeze(1).expand_as(input) + input_softmax.exp_() + + channel_stats = input_softmax.sum(dim=1) + input_softmax /= channel_stats.unsqueeze(1).expand_as(input) + + output = input.new_zeros(input.size(0)) + ext_module.softmax_focal_loss_forward( + input_softmax, + target, + weight, + output, + gamma=ctx.gamma, + alpha=ctx.alpha) + + if ctx.reduction == ctx.reduction_dict['mean']: + output = output.sum() / input.size(0) + elif ctx.reduction == ctx.reduction_dict['sum']: + output = output.sum() + ctx.save_for_backward(input_softmax, target, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + input_softmax, target, weight = ctx.saved_tensors + buff = input_softmax.new_zeros(input_softmax.size(0)) + grad_input = input_softmax.new_zeros(input_softmax.size()) + + ext_module.softmax_focal_loss_backward( + input_softmax, + target, + weight, + buff, + grad_input, + gamma=ctx.gamma, + alpha=ctx.alpha) + + grad_input *= grad_output + if ctx.reduction == ctx.reduction_dict['mean']: + grad_input /= input_softmax.size(0) + return grad_input, None, None, None, None, None + + +softmax_focal_loss = SoftmaxFocalLossFunction.apply + + +class SoftmaxFocalLoss(nn.Module): + + def __init__(self, gamma, alpha, weight=None, reduction='mean'): + super(SoftmaxFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha 
class FurthestPointSampling(Function):
    """Uses iterative furthest point sampling to select a set of features whose
    corresponding points have the furthest distance."""

    @staticmethod
    def forward(ctx, points_xyz: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """
        Args:
            points_xyz (Tensor): (B, N, 3) where N > num_points.
                Must be contiguous.
            num_points (int): Number of points in the sampled set.

        Returns:
            Tensor: (B, num_points) int32 indices of the sampled points,
                marked as non-differentiable.
        """
        assert points_xyz.is_contiguous()

        B, N = points_xyz.size()[:2]
        # Allocate the scratch buffers from the input tensor so they live on
        # the same device as ``points_xyz``. The legacy ``torch.cuda.*Tensor``
        # constructors allocate on the *current* CUDA device, which is wrong
        # when the input sits on a different GPU. Dtypes are pinned to the
        # ones the legacy constructors produced (int32 / float32).
        output = points_xyz.new_zeros((B, num_points), dtype=torch.int32)
        temp = points_xyz.new_full((B, N), 1e10, dtype=torch.float32)

        ext_module.furthest_point_sampling_forward(
            points_xyz,
            temp,
            output,
            b=B,
            n=N,
            m=num_points,
        )
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(output)
        return output

    @staticmethod
    def backward(xyz, a=None):
        # One (absent) gradient per forward input: points_xyz, num_points.
        return None, None
+# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator +# Augmentation (ADA) +# ======================================================================= + +# 1. Definitions + +# "Licensor" means any person or entity that distributes its Work. + +# "Software" means the original work of authorship made available under +# this License. + +# "Work" means the Software and any additions to or derivative works of +# the Software that are made available under this License. + +# The terms "reproduce," "reproduction," "derivative works," and +# "distribution" have the meaning as provided under U.S. copyright law; +# provided, however, that for the purposes of this License, derivative +# works shall not include works that remain separable from, or merely +# link (or bind by name) to the interfaces of, the Work. + +# Works, including the Software, are "made available" under this License +# by including in or with the Work either (a) a copyright notice +# referencing the applicability of this License to the Work, or (b) a +# copy of this License. + +# 2. License Grants + +# 2.1 Copyright Grant. Subject to the terms and conditions of this +# License, each Licensor grants to you a perpetual, worldwide, +# non-exclusive, royalty-free, copyright license to reproduce, +# prepare derivative works of, publicly display, publicly perform, +# sublicense and distribute its Work and any resulting derivative +# works in any form. + +# 3. Limitations + +# 3.1 Redistribution. You may reproduce or distribute the Work only +# if (a) you do so under this License, (b) you include a complete +# copy of this License with your distribution, and (c) you retain +# without modification any copyright, patent, trademark, or +# attribution notices that are present in the Work. + +# 3.2 Derivative Works. 
You may specify that additional or different +# terms apply to the use, reproduction, and distribution of your +# derivative works of the Work ("Your Terms") only if (a) Your Terms +# provide that the use limitation in Section 3.3 applies to your +# derivative works, and (b) you identify the specific derivative +# works that are subject to Your Terms. Notwithstanding Your Terms, +# this License (including the redistribution requirements in Section +# 3.1) will continue to apply to the Work itself. + +# 3.3 Use Limitation. The Work and any derivative works thereof only +# may be used or intended for use non-commercially. Notwithstanding +# the foregoing, NVIDIA and its affiliates may use the Work and any +# derivative works commercially. As used herein, "non-commercially" +# means for research or evaluation purposes only. + +# 3.4 Patent Claims. If you bring or threaten to bring a patent claim +# against any Licensor (including any claim, cross-claim or +# counterclaim in a lawsuit) to enforce any patents that you allege +# are infringed by any Work, then your rights under this License from +# such Licensor (including the grant in Section 2.1) will terminate +# immediately. + +# 3.5 Trademarks. This License does not grant any rights to use any +# Licensor’s or its affiliates’ names, logos, or trademarks, except +# as necessary to reproduce the notices described in this License. + +# 3.6 Termination. If you violate any term of this License, then your +# rights under this License (including the grant in Section 2.1) will +# terminate immediately. + +# 4. Disclaimer of Warranty. + +# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +# THIS LICENSE. + +# 5. Limitation of Liability. 
+ +# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGES. + +# ======================================================================= + +import torch +import torch.nn.functional as F +from torch import nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu']) + + +class FusedBiasLeakyReLUFunctionBackward(Function): + """Calculate second order deviation. + + This function is to compute the second order deviation for the fused leaky + relu operation. + """ + + @staticmethod + def forward(ctx, grad_output, out, negative_slope, scale): + ctx.save_for_backward(out) + ctx.negative_slope = negative_slope + ctx.scale = scale + + empty = grad_output.new_empty(0) + + grad_input = ext_module.fused_bias_leakyrelu( + grad_output, + empty, + out, + act=3, + grad=1, + alpha=negative_slope, + scale=scale) + + dim = [0] + + if grad_input.ndim > 2: + dim += list(range(2, grad_input.ndim)) + + grad_bias = grad_input.sum(dim).detach() + + return grad_input, grad_bias + + @staticmethod + def backward(ctx, gradgrad_input, gradgrad_bias): + out, = ctx.saved_tensors + + # The second order deviation, in fact, contains two parts, while the + # the first part is zero. Thus, we direct consider the second part + # which is similar with the first order deviation in implementation. 
class FusedBiasLeakyReLU(nn.Module):
    r"""Fused bias leaky ReLU.

    This function is introduced in the StyleGAN2:
    http://arxiv.org/abs/1912.04958

    The bias term comes from the convolution operation. In addition, to keep
    the variance of the feature map or gradients unchanged, they also adopt a
    scale similarly with Kaiming initialization. However, since the
    :math:`1+{alpha}^2` is too small, we can just ignore it. Therefore, the
    final scale is just :math:`\sqrt{2}`. Of course, you may change it with
    your own scale.

    The module owns a learnable, zero-initialised bias with one entry per
    channel; ``forward`` hands it, together with the stored slope and scale,
    to the module-level ``fused_bias_leakyrelu`` function.

    TODO: Implement the CPU version.

    Args:
        num_channels (int): The channel number of the feature map.
        negative_slope (float, optional): Same as nn.LeakyRelu.
            Defaults to 0.2.
        scale (float, optional): A scalar to adjust the variance of the feature
            map. Defaults to 2**0.5.
    """

    def __init__(self, num_channels, negative_slope=0.2, scale=2**0.5):
        super().__init__()

        self.negative_slope = negative_slope
        self.scale = scale
        # Learnable per-channel bias, starts at zero.
        self.bias = nn.Parameter(torch.zeros(num_channels))

    def forward(self, input):
        """Apply bias, leaky ReLU and scaling to ``input``."""
        return fused_bias_leakyrelu(input, self.bias, self.negative_slope,
                                    self.scale)
def bias_leakyrelu_ref(x, bias, negative_slope=0.2, scale=2**0.5):
    """Reference (non-fused) implementation of bias + leaky ReLU + scaling.

    Args:
        x (torch.Tensor): Input whose dimension 1 is the channel dimension.
        bias (torch.Tensor | None): 1-D tensor with one entry per channel of
            ``x``; it is added along dimension 1. ``None`` skips the addition.
        negative_slope (float, optional): Slope passed to ``F.leaky_relu``.
            Defaults to 0.2.
        scale (float, optional): Factor the activated tensor is multiplied
            by; the multiplication is skipped when it equals 1.
            Defaults to 2**0.5.

    Returns:
        torch.Tensor: A new tensor; ``x`` itself is not modified.
    """
    if bias is not None:
        assert bias.ndim == 1
        assert bias.shape[0] == x.shape[1]
        # Reshape the bias to (1, C, 1, ..., 1) so it broadcasts over every
        # dimension of ``x`` except the channel dimension.
        broadcast_shape = [1] * x.ndim
        if x.ndim > 1:
            broadcast_shape[1] = -1
        x = x + bias.reshape(broadcast_shape)

    activated = F.leaky_relu(x, negative_slope)
    if scale == 1:
        return activated
    return activated * scale
+ """ + assert features.is_contiguous() + assert indices.is_contiguous() + + B, npoint = indices.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, npoint) + + ext_module.gather_points_forward( + features, indices, output, b=B, c=C, n=N, npoints=npoint) + + ctx.for_backwards = (indices, C, N) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(indices) + return output + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + B, npoint = idx.size() + + grad_features = torch.cuda.FloatTensor(B, C, N).zero_() + grad_out_data = grad_out.data.contiguous() + ext_module.gather_points_backward( + grad_out_data, + idx, + grad_features.data, + b=B, + c=C, + n=N, + npoints=npoint) + return grad_features, None + + +gather_points = GatherPoints.apply diff --git a/lavis/common/annotator/uniformer/mmcv/ops/group_points.py b/lavis/common/annotator/uniformer/mmcv/ops/group_points.py new file mode 100644 index 0000000000000000000000000000000000000000..6c3ec9d758ebe4e1c2205882af4be154008253a5 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/ops/group_points.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch import nn as nn +from torch.autograd import Function + +from ..utils import ext_loader +from .ball_query import ball_query +from .knn import knn + +ext_module = ext_loader.load_ext( + '_ext', ['group_points_forward', 'group_points_backward']) + + +class QueryAndGroup(nn.Module): + """Groups points with a ball query of radius. + + Args: + max_radius (float): The maximum radius of the balls. + If None is given, we will use kNN sampling instead of ball query. + sample_num (int): Maximum number of features to gather in the ball. + min_radius (float, optional): The minimum radius of the balls. + Default: 0. + use_xyz (bool, optional): Whether to use xyz. + Default: True. + return_grouped_xyz (bool, optional): Whether to return grouped xyz. 
+ Default: False. + normalize_xyz (bool, optional): Whether to normalize xyz. + Default: False. + uniform_sample (bool, optional): Whether to sample uniformly. + Default: False + return_unique_cnt (bool, optional): Whether to return the count of + unique samples. Default: False. + return_grouped_idx (bool, optional): Whether to return grouped idx. + Default: False. + """ + + def __init__(self, + max_radius, + sample_num, + min_radius=0, + use_xyz=True, + return_grouped_xyz=False, + normalize_xyz=False, + uniform_sample=False, + return_unique_cnt=False, + return_grouped_idx=False): + super().__init__() + self.max_radius = max_radius + self.min_radius = min_radius + self.sample_num = sample_num + self.use_xyz = use_xyz + self.return_grouped_xyz = return_grouped_xyz + self.normalize_xyz = normalize_xyz + self.uniform_sample = uniform_sample + self.return_unique_cnt = return_unique_cnt + self.return_grouped_idx = return_grouped_idx + if self.return_unique_cnt: + assert self.uniform_sample, \ + 'uniform_sample should be True when ' \ + 'returning the count of unique samples' + if self.max_radius is None: + assert not self.normalize_xyz, \ + 'can not normalize grouped xyz when max_radius is None' + + def forward(self, points_xyz, center_xyz, features=None): + """ + Args: + points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. + center_xyz (Tensor): (B, npoint, 3) coordinates of the centriods. + features (Tensor): (B, C, N) Descriptors of the features. + + Returns: + Tensor: (B, 3 + C, npoint, sample_num) Grouped feature. 
class GroupAll(nn.Module):
    """Group xyz with feature.

    All points are put into a single group: the result has a singleton
    "group" dimension inserted at position 2.

    Args:
        use_xyz (bool): Whether to prepend the xyz coordinates to the
            features when features are given. Defaults to True.
    """

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz

    def forward(self,
                xyz: torch.Tensor,
                new_xyz: torch.Tensor,
                features: torch.Tensor = None):
        """
        Args:
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            new_xyz (Tensor): new xyz coordinates of the features.
                Not read by this method.
            features (Tensor): (B, C, N) features to group.

        Returns:
            Tensor: Grouped feature. For the input shapes above this is
                (B, C + 3, 1, N) when features are given and ``use_xyz`` is
                set, (B, C, 1, N) when ``use_xyz`` is not set, and
                (B, 3, 1, N) when ``features`` is None.
        """
        # (B, N, 3) -> (B, 3, N) -> (B, 3, 1, N)
        xyz_group = xyz.transpose(1, 2).unsqueeze(2)
        if features is None:
            # Without features the coordinates are returned regardless of
            # ``use_xyz``.
            return xyz_group

        # (B, C, N) -> (B, C, 1, N)
        feature_group = features.unsqueeze(2)
        if not self.use_xyz:
            return feature_group
        # Coordinates come first along the channel dimension.
        return torch.cat([xyz_group, feature_group], dim=1)
def get_onnxruntime_op_path():
    """Return the path of the compiled ONNX Runtime op library, if present.

    Looks for a file matching ``_ext_ort.*.so`` in the directory one level
    above the directory containing this module.

    Returns:
        str: The first match reported by ``glob``, or an empty string when
            no file matches.
    """
    package_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    candidates = glob.glob(os.path.join(package_dir, '_ext_ort.*.so'))
    return candidates[0] if candidates else ''
def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
    """NMS function GPU implementation (for BEV boxes). The overlap of two
    boxes for IoU calculation is defined as the exact overlapping area of the
    two boxes. In this function, one can also set ``pre_max_size`` and
    ``post_max_size``.

    Args:
        boxes (torch.Tensor): Input boxes with the shape of [N, 5]
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of boxes with the shape of [N].
        thresh (float): Overlap threshold of NMS.
        pre_max_size (int, optional): Keep only this many top-scoring boxes
            before NMS. Default: None (keep all).
        post_max_size (int, optional): Keep at most this many boxes after
            NMS. Default: None (keep all).

    Returns:
        torch.Tensor: Indexes (into the original ``boxes``) after NMS.
    """
    assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]'
    _, ranking = scores.sort(0, descending=True)

    if pre_max_size is not None:
        ranking = ranking[:pre_max_size]
    candidates = boxes[ranking].contiguous()

    # CPU scratch buffer handed to the extension; presumably it is filled
    # with positions into ``candidates`` and the return value is the number
    # of valid leading entries -- that is how the result is consumed below.
    kept_slots = torch.zeros(candidates.size(0), dtype=torch.long)
    num_kept = ext_module.iou3d_nms_forward(candidates, kept_slots, thresh)
    # Map positions within ``candidates`` back to indices of the input.
    selected = ranking[kept_slots[:num_kept].cuda(
        candidates.device)].contiguous()
    if post_max_size is None:
        return selected
    return selected[:post_max_size]
class KNN(Function):
    r"""KNN (CUDA) based on heap data structure.

    Modified from PAConv.

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: torch.Tensor = None,
                transposed: bool = False) -> torch.Tensor:
        """
        Args:
            k (int): number of nearest neighbors. Must satisfy 0 < k < 100.
            xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N).
                xyz coordinates of the features.
            center_xyz (Tensor, optional): (B, npoint, 3) if transposed ==
                False, else (B, 3, npoint). centers of the knn query.
                When None, ``xyz`` itself is used as the query centers.
                Default: None.
            transposed (bool, optional): whether the input tensors are
                transposed. Should not explicitly use this keyword when
                calling knn (=KNN.apply), just add the fourth param.
                Default: False.

        Returns:
            Tensor: (B, k, npoint) int32 tensor with the indices of
                the features that form k-nearest neighbours. It is marked
                as non-differentiable.
        """
        assert 0 < k < 100, 'k should be in range(0, 100)'

        if center_xyz is None:
            center_xyz = xyz

        if transposed:
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        assert xyz.is_contiguous()  # [B, N, 3]
        assert center_xyz.is_contiguous()  # [B, npoint, 3]

        center_xyz_device = center_xyz.get_device()
        assert center_xyz_device == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        # NOTE: this switches the process-wide current CUDA device to the
        # one holding the inputs and does not switch it back afterwards.
        if torch.cuda.current_device() != center_xyz_device:
            torch.cuda.set_device(center_xyz_device)

        B, npoint, _ = center_xyz.shape
        N = xyz.shape[1]

        # Output buffers, allocated next to the query centers.
        idx = center_xyz.new_zeros((B, npoint, k)).int()
        dist2 = center_xyz.new_zeros((B, npoint, k)).float()

        ext_module.knn_forward(
            xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)
        # idx shape to [B, k, npoint]
        idx = idx.transpose(2, 1).contiguous()
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        # autograd expects exactly one entry per forward input
        # (k, xyz, center_xyz, transposed); none of them receives a gradient.
        return None, None, None, None
class MaskedConv2dFunction(Function):
    """Forward-only convolution evaluated only where ``mask`` is positive.

    Output positions whose mask value is not > 0 stay zero. No gradient is
    implemented: ``backward`` returns ``None`` for every input.
    """

    @staticmethod
    def symbolic(g, features, mask, weight, bias, padding, stride):
        """Emit the ``mmcv::MMCVMaskedConv2d`` node for ONNX export."""
        return g.op(
            'mmcv::MMCVMaskedConv2d',
            features,
            mask,
            weight,
            bias,
            padding_i=padding,
            stride_i=stride)

    @staticmethod
    def forward(ctx, features, mask, weight, bias, padding=0, stride=1):
        """Compute the masked convolution.

        Args:
            features (Tensor): 4-D input whose first dimension is 1.
            mask (Tensor): 3-D mask whose first dimension is 1 and whose
                last two dimensions equal those of ``features``.
            weight (Tensor): 4-D kernel
                (out_channel, in_channel, kernel_h, kernel_w).
            bias (Tensor | None): per-output-channel bias; skipped if None.
            padding (int | tuple[int]): zero padding. Default: 0.
            stride (int | tuple[int]): only 1 is supported. Default: 1.

        Returns:
            Tensor: (1, out_channel, out_h, out_w) output, zero wherever
            the mask is not positive.

        Raises:
            ValueError: if ``stride`` is not 1.
        """
        assert mask.dim() == 3 and mask.size(0) == 1
        assert features.dim() == 4 and features.size(0) == 1
        assert features.size()[2:] == mask.size()[1:]
        pad_h, pad_w = _pair(padding)
        stride_h, stride_w = _pair(stride)
        if stride_h != 1 or stride_w != 1:
            raise ValueError(
                'Stride could only be 1 in masked_conv2d currently.')
        out_channel, in_channel, kernel_h, kernel_w = weight.size()

        batch_size = features.size(0)
        # Standard convolution output size with dilation 1. The width must
        # use kernel_w (the original used kernel_h for both axes).
        out_h = (features.size(2) + 2 * pad_h -
                 (kernel_h - 1) - 1) // stride_h + 1
        out_w = (features.size(3) + 2 * pad_w -
                 (kernel_w - 1) - 1) // stride_w + 1
        mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False)
        output = features.new_zeros(batch_size, out_channel, out_h, out_w)
        if mask_inds.numel() > 0:
            mask_h_idx = mask_inds[:, 0].contiguous()
            mask_w_idx = mask_inds[:, 1].contiguous()
            # One im2col column per selected mask position.
            data_col = features.new_zeros(in_channel * kernel_h * kernel_w,
                                          mask_inds.size(0))
            ext_module.masked_im2col_forward(
                features,
                mask_h_idx,
                mask_w_idx,
                data_col,
                kernel_h=kernel_h,
                kernel_w=kernel_w,
                pad_h=pad_h,
                pad_w=pad_w)

            flat_weight = weight.view(out_channel, -1)
            if bias is None:
                masked_output = torch.mm(flat_weight, data_col)
            else:
                # beta = alpha = 1 are the defaults; the positional
                # (beta, input, alpha, ...) form is no longer accepted.
                masked_output = torch.addmm(bias[:, None], flat_weight,
                                            data_col)
            ext_module.masked_col2im_forward(
                masked_output,
                mask_h_idx,
                mask_w_idx,
                output,
                height=out_h,
                width=out_w,
                channels=out_channel)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return ``None`` for each of the six ``forward`` inputs."""
        return (None, ) * 6


masked_conv2d = MaskedConv2dFunction.apply


class MaskedConv2d(nn.Conv2d):
    """A MaskedConv2d which inherits the official Conv2d.

    The masked forward doesn't implement the backward function and only
    supports the stride parameter to be 1 currently.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True):
        super(MaskedConv2d,
              self).__init__(in_channels, out_channels, kernel_size, stride,
                             padding, dilation, groups, bias)

    def forward(self, input, mask=None):
        """Apply a plain Conv2d when ``mask`` is None, else the masked op."""
        if mask is None:  # fallback to the normal Conv2d
            return super(MaskedConv2d, self).forward(input)
        else:
            return masked_conv2d(input, mask, self.weight, self.bias,
                                 self.padding)
class BaseMergeCell(nn.Module):
    """The basic class for cells used in NAS-FPN and NAS-FCOS.

    BaseMergeCell takes 2 inputs. After applying convolution
    on them, they are resized to the target size. Then,
    they go through binary_op, which depends on the type of cell.
    If with_out_conv is True, the result of output will go through
    another convolution layer.

    Args:
        fused_channels (int): number of input channels in out_conv layer,
            i.e. the channel count produced by ``_binary_op``.
        out_channels (int): number of output channels in out_conv layer.
        with_out_conv (bool): Whether to use out_conv layer
        out_conv_cfg (dict): Config dict for convolution layer, which should
            contain "groups", "kernel_size", "padding", "bias" to build
            out_conv layer.
        out_norm_cfg (dict): Config dict for normalization layer in out_conv.
        out_conv_order (tuple): The order of conv/norm/activation layers in
            out_conv.
        with_input1_conv (bool): Whether to use convolution on input1.
        with_input2_conv (bool): Whether to use convolution on input2.
        input_conv_cfg (dict): Config dict for building input1_conv layer and
            input2_conv layer, which is expected to contain the type of
            convolution.
            Default: None, which means using conv2d.
        input_norm_cfg (dict): Config dict for normalization layer in
            input1_conv and input2_conv layer. Default: None.
        upsample_mode (str): Interpolation method used to resize the output
            of input1_conv and input2_conv to target size. Currently, we
            support ['nearest', 'bilinear']. Default: 'nearest'.
    """

    def __init__(self,
                 fused_channels=256,
                 out_channels=256,
                 with_out_conv=True,
                 out_conv_cfg=dict(
                     groups=1, kernel_size=3, padding=1, bias=True),
                 out_norm_cfg=None,
                 out_conv_order=('act', 'conv', 'norm'),
                 with_input1_conv=False,
                 with_input2_conv=False,
                 input_conv_cfg=None,
                 input_norm_cfg=None,
                 upsample_mode='nearest'):
        super(BaseMergeCell, self).__init__()
        assert upsample_mode in ['nearest', 'bilinear']
        self.with_out_conv = with_out_conv
        self.with_input1_conv = with_input1_conv
        self.with_input2_conv = with_input2_conv
        self.upsample_mode = upsample_mode

        if self.with_out_conv:
            # out_conv_cfg is only unpacked, never mutated, so the shared
            # default dict is safe.
            self.out_conv = ConvModule(
                fused_channels,
                out_channels,
                **out_conv_cfg,
                norm_cfg=out_norm_cfg,
                order=out_conv_order)

        # An empty Sequential acts as an identity when no input conv is used.
        self.input1_conv = self._build_input_conv(
            out_channels, input_conv_cfg,
            input_norm_cfg) if with_input1_conv else nn.Sequential()
        self.input2_conv = self._build_input_conv(
            out_channels, input_conv_cfg,
            input_norm_cfg) if with_input2_conv else nn.Sequential()

    def _build_input_conv(self, channel, conv_cfg, norm_cfg):
        """Build a 3x3, padding-1, channel-preserving ConvModule."""
        return ConvModule(
            channel,
            channel,
            3,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            bias=True)

    @abstractmethod
    def _binary_op(self, x1, x2):
        """Fuse the two resized inputs; implemented by subclasses."""
        pass

    def _resize(self, x, size):
        """Resize ``x`` so its last two dimensions equal ``size``.

        Smaller inputs are interpolated with ``self.upsample_mode``; larger
        inputs are max-pooled and must be an integer multiple of ``size``
        along both axes.
        """
        if x.shape[-2:] == size:
            return x
        elif x.shape[-2:] < size:
            # NOTE: this is a lexicographic (height first) tuple comparison.
            return F.interpolate(x, size=size, mode=self.upsample_mode)
        else:
            assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0
            # Use a per-axis factor so anisotropic downsampling also lands
            # exactly on ``size`` (the width ratio alone is not enough).
            kernel_size = (x.shape[-2] // size[-2], x.shape[-1] // size[-1])
            x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size)
            return x

    def forward(self, x1, x2, out_size=None):
        """Convolve, resize and fuse ``x1`` and ``x2``.

        Args:
            x1 (Tensor): first input; must share batch and channel
                dimensions with ``x2``.
            x2 (Tensor): second input.
            out_size (tuple | None): target (h, w). When None, the larger
                of the two input sizes is used.

        Returns:
            Tensor: the fused feature map, passed through ``out_conv`` when
            ``with_out_conv`` is True.
        """
        assert x1.shape[:2] == x2.shape[:2]
        assert out_size is None or len(out_size) == 2
        if out_size is None:  # resize to larger one
            out_size = max(x1.size()[2:], x2.size()[2:])

        x1 = self.input1_conv(x1)
        x2 = self.input2_conv(x2)

        x1 = self._resize(x1, out_size)
        x2 = self._resize(x2, out_size)

        x = self._binary_op(x1, x2)
        if self.with_out_conv:
            x = self.out_conv(x)
        return x


class SumCell(BaseMergeCell):
    """Merge cell that adds the two inputs element-wise."""

    def __init__(self, in_channels, out_channels, **kwargs):
        super(SumCell, self).__init__(in_channels, out_channels, **kwargs)

    def _binary_op(self, x1, x2):
        return x1 + x2


class ConcatCell(BaseMergeCell):
    """Merge cell that concatenates the two inputs along channels."""

    def __init__(self, in_channels, out_channels, **kwargs):
        # Concatenation doubles the channels fed to out_conv.
        super(ConcatCell, self).__init__(in_channels * 2, out_channels,
                                         **kwargs)

    def _binary_op(self, x1, x2):
        ret = torch.cat([x1, x2], dim=1)
        return ret


class GlobalPoolingCell(BaseMergeCell):
    """Merge cell that gates ``x1`` with globally pooled ``x2``."""

    def __init__(self, in_channels=None, out_channels=None, **kwargs):
        super().__init__(in_channels, out_channels, **kwargs)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

    def _binary_op(self, x1, x2):
        # Per-channel sigmoid gate from the global average of x2.
        x2_att = self.global_pool(x2).sigmoid()
        return x2 + x2_att * x1
class ModulatedDeformConv2dFunction(Function):
    """Autograd function wrapping the modulated deformable conv extension."""

    @staticmethod
    def symbolic(g, input, offset, mask, weight, bias, stride, padding,
                 dilation, groups, deform_groups):
        """Emit the ``mmcv::MMCVModulatedDeformConv2d`` node for ONNX."""
        input_tensors = [input, offset, mask, weight]
        if bias is not None:
            input_tensors.append(bias)
        return g.op(
            'mmcv::MMCVModulatedDeformConv2d',
            *input_tensors,
            stride_i=stride,
            padding_i=padding,
            dilation_i=dilation,
            groups_i=groups,
            deform_groups_i=deform_groups)

    @staticmethod
    def forward(ctx,
                input,
                offset,
                mask,
                weight,
                bias=None,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                deform_groups=1):
        """Run the forward extension op.

        Args:
            input (Tensor): 4-D input feature map.
            offset (Tensor): sampling offsets; its dtype decides the dtype
                ``input`` and ``weight`` are cast to.
            mask (Tensor): modulation mask.
            weight (Tensor): 4-D convolution kernel.
            bias (Tensor | None): optional bias. Default: None.
            stride, padding, dilation (int | tuple[int]): convolution
                geometry, expanded to pairs.
            groups (int): passed to the op as ``group``.
            deform_groups (int): passed to the op as ``deformable_group``.

        Returns:
            Tensor: output of size computed by :meth:`_output_size`.

        Raises:
            ValueError: if ``input`` is not 4-D or the output size would
                not be positive.
        """
        if input is not None and input.dim() != 4:
            # Keep the message on logical lines: a backslash continuation
            # inside the literal would embed the source indentation.
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deform_groups = deform_groups
        ctx.with_bias = bias is not None
        if not ctx.with_bias:
            bias = input.new_empty(0)  # fake tensor
        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of model (float32), but "offset" is cast
        # to float16 by nn.Conv2d automatically, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "offset",
        # we cast weight and input to temporarily support fp16 and amp
        # whatever the pytorch version is.
        input = input.type_as(offset)
        weight = weight.type_as(input)
        ctx.save_for_backward(input, offset, mask, weight, bias)
        output = input.new_empty(
            ModulatedDeformConv2dFunction._output_size(ctx, input, weight))
        # Scratch buffers handed to the extension; reused in backward.
        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
        ext_module.modulated_deform_conv_forward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            output,
            ctx._bufs[1],
            kernel_h=weight.size(2),
            kernel_w=weight.size(3),
            stride_h=ctx.stride[0],
            stride_w=ctx.stride[1],
            pad_h=ctx.padding[0],
            pad_w=ctx.padding[1],
            dilation_h=ctx.dilation[0],
            dilation_w=ctx.dilation[1],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            with_bias=ctx.with_bias)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Compute gradients for input, offset, mask, weight and bias.

        Returns one entry per ``forward`` input; the five non-tensor
        arguments get ``None``, and so does ``bias`` when none was given.
        """
        input, offset, mask, weight, bias = ctx.saved_tensors
        grad_input = torch.zeros_like(input)
        grad_offset = torch.zeros_like(offset)
        grad_mask = torch.zeros_like(mask)
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(bias)
        grad_output = grad_output.contiguous()
        ext_module.modulated_deform_conv_backward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            ctx._bufs[1],
            grad_input,
            grad_weight,
            grad_bias,
            grad_offset,
            grad_mask,
            grad_output,
            kernel_h=weight.size(2),
            kernel_w=weight.size(3),
            stride_h=ctx.stride[0],
            stride_w=ctx.stride[1],
            pad_h=ctx.padding[0],
            pad_w=ctx.padding[1],
            dilation_h=ctx.dilation[0],
            dilation_w=ctx.dilation[1],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            with_bias=ctx.with_bias)
        if not ctx.with_bias:
            grad_bias = None

        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
                None, None, None, None, None)

    @staticmethod
    def _output_size(ctx, input, weight):
        """Return the output size as a tuple (batch, channels, *spatial).

        Raises:
            ValueError: if any output dimension would not be positive.
        """
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            # Effective kernel extent once dilation is applied.
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(s > 0 for s in output_size):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply
class ModulatedDeformConv2d(nn.Module):
    """Modulated deformable 2-D convolution layer.

    The layer owns the convolution weight (and optional bias); the sampling
    ``offset`` and modulation ``mask`` are passed in by the caller of
    :meth:`forward`.
    """

    @deprecated_api_warning({'deformable_groups': 'deform_groups'},
                            cls_name='ModulatedDeformConv2d')
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 deform_groups=1,
                 bias=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Geometry arguments are stored as pairs.
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deform_groups = deform_groups
        # enable compatibility with nn.Conv2d
        self.transposed = False
        self.output_padding = _single(0)

        kernel_h, kernel_w = self.kernel_size
        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // groups, kernel_h,
                         kernel_w))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        self.init_weights()

    def init_weights(self):
        """Draw the weight from U(-b, b) and zero the bias.

        ``b`` is 1 / sqrt(in_channels * prod(kernel_size)).
        """
        fan_in = self.in_channels
        for extent in self.kernel_size:
            fan_in *= extent
        bound = 1. / math.sqrt(fan_in)
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x, offset, mask):
        """Apply the convolution with caller-provided offset and mask."""
        return modulated_deform_conv2d(x, offset, mask, self.weight,
                                       self.bias, self.stride, self.padding,
                                       self.dilation, self.groups,
                                       self.deform_groups)


@CONV_LAYERS.register_module('DCNv2')
class ModulatedDeformConv2dPack(ModulatedDeformConv2d):
    """A ModulatedDeformable Conv Encapsulation that acts as normal Conv
    layers.

    An internal ``conv_offset`` convolution predicts the offsets and the
    modulation mask from the input, so ``forward`` only takes ``x``.

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int): Same as nn.Conv2d, while tuple is not supported.
        padding (int): Same as nn.Conv2d, while tuple is not supported.
        dilation (int): Same as nn.Conv2d, while tuple is not supported.
        groups (int): Same as nn.Conv2d.
        bias (bool or str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
            False.
    """

    _version = 2

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 3 channels per kernel tap and deform group: two offset
        # components plus one mask value.
        offset_channels = (
            self.deform_groups * 3 * self.kernel_size[0] *
            self.kernel_size[1])
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            offset_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            bias=True)
        self.init_weights()

    def init_weights(self):
        """Initialise the main weights and zero ``conv_offset``."""
        super().init_weights()
        # The parent constructor calls this before conv_offset exists.
        if not hasattr(self, 'conv_offset'):
            return
        self.conv_offset.weight.data.zero_()
        self.conv_offset.bias.data.zero_()

    def forward(self, x):
        """Predict offset and mask from ``x`` and apply the convolution."""
        predicted = self.conv_offset(x)
        first, second, raw_mask = torch.chunk(predicted, 3, dim=1)
        offset = torch.cat((first, second), dim=1)
        mask = torch.sigmoid(raw_mask)
        return modulated_deform_conv2d(x, offset, mask, self.weight,
                                       self.bias, self.stride, self.padding,
                                       self.dilation, self.groups,
                                       self.deform_groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Load parameters, renaming legacy ``*_offset`` keys first."""
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, ModulatedDeformConvPack
            # loads previous benchmark models.
            legacy_stem = prefix[:-1] + '_offset.'
            for suffix in ('weight', 'bias'):
                current_key = prefix + 'conv_offset.' + suffix
                legacy_key = legacy_stem + suffix
                if (current_key not in state_dict
                        and legacy_key in state_dict):
                    state_dict[current_key] = state_dict.pop(legacy_key)

        if version is not None and version > 1:
            print_log(
                f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to '
                'version 2.',
                logger='root')

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)
class MultiScaleDeformableAttnFunction(Function):
    """Autograd wrapper around the ``ms_deform_attn`` extension ops."""

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        """GPU version of multi-scale deformable attention.

        Args:
            value (Tensor): The value has shape
                (bs, num_keys, num_heads, embed_dims//num_heads)
            value_spatial_shapes (Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2),
                last dimension 2 represent (h, w)
            value_level_start_index (Tensor): forwarded unchanged to the
                extension op.
            sampling_locations (Tensor): The location of sampling points,
                has shape
                (bs ,num_queries, num_heads, num_levels, num_points, 2),
                the last dimension 2 represent (x, y).
            attention_weights (Tensor): The weight of sampling points used
                when calculate the attention, has shape
                (bs ,num_queries, num_heads, num_levels, num_points),
            im2col_step: The step used in image to column; kept on ``ctx``
                for the backward pass.

        Returns:
            Tensor: has shape (bs, num_queries, embed_dims)
        """
        ctx.im2col_step = im2col_step
        tensors = (value, value_spatial_shapes, value_level_start_index,
                   sampling_locations, attention_weights)
        output = ext_module.ms_deform_attn_forward(
            *tensors, im2col_step=ctx.im2col_step)
        ctx.save_for_backward(*tensors)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """GPU version of backward function.

        Args:
            grad_output (Tensor): Gradient
                of output tensor of forward.

        Returns:
            Tuple[Tensor]: Gradient of input tensors in forward; ``None``
            for the spatial shapes, level start index and im2col_step.
        """
        (value, value_spatial_shapes, value_level_start_index,
         sampling_locations, attention_weights) = ctx.saved_tensors
        # Gradient buffers filled in place by the extension.
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        return (grad_value, None, None, grad_sampling_loc, grad_attn_weight,
                None)
def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes,
                                        sampling_locations, attention_weights):
    """CPU version of multi-scale deformable attention.

    Args:
        value (Tensor): The value has shape
            (bs, num_keys, num_heads, embed_dims//num_heads)
        value_spatial_shapes (Tensor): Spatial shape of
            each feature map, has shape (num_levels, 2),
            last dimension 2 represent (h, w)
        sampling_locations (Tensor): The location of sampling points,
            has shape
            (bs ,num_queries, num_heads, num_levels, num_points, 2),
            the last dimension 2 represent (x, y).
        attention_weights (Tensor): The weight of sampling points used
            when calculate the attention, has shape
            (bs ,num_queries, num_heads, num_levels, num_points),

    Returns:
        Tensor: has shape (bs, num_queries, embed_dims)
    """
    batch, _, heads, head_dim = value.shape
    _, queries, heads, levels, points, _ = sampling_locations.shape

    # Split the flattened keys into one chunk per feature level.
    level_sizes = [H_ * W_ for H_, W_ in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)

    # grid_sample expects coordinates in [-1, 1] rather than [0, 1].
    grids = 2 * sampling_locations - 1

    sampled = []
    for lvl, (H_, W_) in enumerate(value_spatial_shapes):
        # (bs, H_*W_, heads, dim) -> (bs, H_*W_, heads*dim)
        # -> (bs, heads*dim, H_*W_) -> (bs*heads, dim, H_, W_)
        feature_map = per_level_values[lvl].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, H_, W_)
        # (bs, queries, heads, points, 2) -> (bs, heads, queries, points, 2)
        # -> (bs*heads, queries, points, 2)
        level_grid = grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)
        # (bs*heads, dim, queries, points)
        sampled.append(
            F.grid_sample(
                feature_map,
                level_grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False))

    # (bs, queries, heads, levels, points)
    # -> (bs, heads, queries, levels, points)
    # -> (bs*heads, 1, queries, levels*points)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, queries, levels * points)
    stacked = torch.stack(sampled, dim=-2).flatten(-2)
    weighted = (stacked * weights).sum(-1)
    output = weighted.view(batch, heads * head_dim, queries)
    return output.transpose(1, 2).contiguous()
@ATTENTION.register_module()
class MultiScaleDeformableAttention(BaseModule):
    """An attention module used in Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.`

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=False,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            valid = isinstance(n, int) and n >= 0
            if not valid:
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points

        # One (x, y) offset and one weight per head, level and point.
        samples_per_query = num_heads * num_levels * num_points
        self.sampling_offsets = nn.Linear(embed_dims, samples_per_query * 2)
        self.attention_weights = nn.Linear(embed_dims, samples_per_query)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        # One direction per head, evenly spaced around the circle.
        angle_step = 2.0 * math.pi / self.num_heads
        thetas = torch.arange(self.num_heads, dtype=torch.float32) * angle_step
        directions = torch.stack([thetas.cos(), thetas.sin()], -1)
        # Scale so the larger component has magnitude 1.
        directions = directions / directions.abs().max(-1, keepdim=True)[0]
        grid_init = directions.view(self.num_heads, 1, 1, 2).repeat(
            1, self.num_levels, self.num_points, 1)
        # The i-th point starts (i + 1) steps along its head's direction.
        for point in range(self.num_points):
            grid_init[:, :, point, :] *= point + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiScaleDeformableAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`. Not used by this module.
            value (Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`. If None, `query` is used.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): mask with shape [bs, num_key];
                masked positions of the projected value are set to 0.
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_levels, 2),
                all elements is range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area.
                or (N, Length_{query}, num_levels, 4), add
                additional two dimensions is (w, h) to
                form reference boxes.
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
             Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """
        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)

        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)

        # Softmax jointly over all levels and points of each head.
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points)

        last_dim = reference_points.shape[-1]
        if last_dim == 2:
            # Offsets are divided by (w, h) of each level.
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        elif last_dim == 4:
            # Offsets are scaled by half the reference box size.
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        use_cuda_op = torch.cuda.is_available() and value.is_cuda
        if use_cuda_op:
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        output = self.output_proj(output)

        if not self.batch_first:
            # (num_query, bs ,embed_dims)
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity
# This function is modified from: https://github.com/pytorch/vision/
class NMSop(torch.autograd.Function):
    """Autograd/ONNX wrapper around the ``nms`` extension op."""

    @staticmethod
    def forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold,
                max_num):
        """Return indices of the boxes kept by NMS.

        Boxes scoring at or below ``score_threshold`` are dropped first
        (only when the threshold is > 0), and at most ``max_num`` indices
        are returned when ``max_num`` > 0. Indices refer to the unfiltered
        input.
        """
        filter_by_score = score_threshold > 0
        if filter_by_score:
            keep_mask = scores > score_threshold
            bboxes = bboxes[keep_mask]
            scores = scores[keep_mask]
            original_inds = torch.nonzero(
                keep_mask, as_tuple=False).squeeze(dim=1)

        inds = ext_module.nms(
            bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)

        if max_num > 0:
            inds = inds[:max_num]
        if filter_by_score:
            # Map back from filtered positions to input positions.
            inds = original_inds[inds]
        return inds

    @staticmethod
    def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold,
                 max_num):
        """Build the ONNX graph: custom mmcv op or NonMaxSuppression."""
        from ..onnx import is_custom_op_loaded
        has_custom_op = is_custom_op_loaded()
        # TensorRT nms plugin is aligned with original nms in ONNXRuntime
        is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT'
        if has_custom_op and (not is_trt_backend):
            return g.op(
                'mmcv::NonMaxSuppression',
                bboxes,
                scores,
                iou_threshold_f=float(iou_threshold),
                offset_i=int(offset))

        from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze
        from ..onnx.onnx_utils.symbolic_helper import _size_helper

        boxes = unsqueeze(g, bboxes, 0)
        scores = unsqueeze(g, unsqueeze(g, scores, 0), 0)

        if max_num > 0:
            max_num = g.op(
                'Constant',
                value_t=torch.tensor(max_num, dtype=torch.long))
        else:
            # No cap requested: use the number of input boxes.
            dim = g.op('Constant', value_t=torch.tensor(0))
            max_num = _size_helper(g, bboxes, dim)
        max_output_per_class = max_num
        iou_threshold = g.op(
            'Constant',
            value_t=torch.tensor([iou_threshold], dtype=torch.float))
        score_threshold = g.op(
            'Constant',
            value_t=torch.tensor([score_threshold], dtype=torch.float))
        nms_out = g.op('NonMaxSuppression', boxes, scores,
                       max_output_per_class, iou_threshold,
                       score_threshold)
        # Select column 2 of the NonMaxSuppression output.
        column = g.op(
            'Constant', value_t=torch.tensor([2], dtype=torch.long))
        return squeeze(g, select(g, nms_out, 1, column), 1)


class SoftNMSop(torch.autograd.Function):
    """Autograd/ONNX wrapper around the ``softnms`` extension op."""

    @staticmethod
    def forward(ctx, boxes, scores, iou_threshold, sigma, min_score, method,
                offset):
        """Run Soft-NMS on CPU copies; return ``(dets, inds)``.

        ``dets`` is an (N, 5) CPU buffer handed to the extension.
        """
        num_boxes = boxes.size(0)
        dets = boxes.new_empty((num_boxes, 5), device='cpu')
        inds = ext_module.softnms(
            boxes.cpu(),
            scores.cpu(),
            dets.cpu(),
            iou_threshold=float(iou_threshold),
            sigma=float(sigma),
            min_score=float(min_score),
            method=int(method),
            offset=int(offset))
        return dets, inds

    @staticmethod
    def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method,
                 offset):
        """Emit ``mmcv::SoftNonMaxSuppression`` (needs torch >= 1.7.0)."""
        from packaging import version
        assert version.parse(torch.__version__) >= version.parse('1.7.0')
        nms_out = g.op(
            'mmcv::SoftNonMaxSuppression',
            boxes,
            scores,
            iou_threshold_f=float(iou_threshold),
            sigma_f=float(sigma),
            min_score_f=float(min_score),
            method_i=int(method),
            offset_i=int(offset),
            outputs=2)
        return nms_out
@deprecated_api_warning({'iou_thr': 'iou_threshold'})
def nms(boxes, scores, iou_threshold, offset=0, score_threshold=0, max_num=-1):
    """Run non-maximum suppression on axis-aligned boxes.

    Inputs may be torch tensors or numpy arrays. Numpy inputs are converted
    to tensors for the computation, and the results are converted back to
    numpy when ``boxes`` was given as a numpy array.

    Arguments:
        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
        scores (torch.Tensor or np.ndarray): scores in shape (N, ).
        iou_threshold (float): IoU threshold for NMS.
        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).
        score_threshold (float): score threshold for NMS.
        max_num (int): maximum number of boxes after NMS.

    Returns:
        tuple: ``(dets, inds)`` where ``dets`` holds the kept boxes with
            their score appended as a fifth column and ``inds`` holds the
            indices of the kept boxes.

    Example:
        >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9],
        >>>                   [49.3, 32.9, 51.0, 35.3],
        >>>                   [49.2, 31.8, 51.0, 35.4],
        >>>                   [35.1, 11.5, 39.1, 15.7],
        >>>                   [35.6, 11.8, 39.3, 14.2],
        >>>                   [35.3, 11.5, 39.9, 14.5],
        >>>                   [35.2, 11.7, 39.7, 15.7]], dtype=np.float32)
        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\
               dtype=np.float32)
        >>> iou_threshold = 0.6
        >>> dets, inds = nms(boxes, scores, iou_threshold)
        >>> assert len(inds) == len(dets) == 3
    """
    assert isinstance(boxes, (torch.Tensor, np.ndarray))
    assert isinstance(scores, (torch.Tensor, np.ndarray))

    # Only the type of `boxes` decides whether the outputs go back to numpy.
    is_numpy = isinstance(boxes, np.ndarray)
    if is_numpy:
        boxes = torch.from_numpy(boxes)
    if isinstance(scores, np.ndarray):
        scores = torch.from_numpy(scores)

    assert boxes.size(1) == 4
    assert boxes.size(0) == scores.size(0)
    assert offset in (0, 1)

    if torch.__version__ == 'parrots':
        # The parrots extension is called directly; score_threshold and
        # max_num are not forwarded on this path.
        inds = ext_module.nms(
            boxes,
            scores,
            iou_threshold=float(iou_threshold),
            offset=int(offset))
    else:
        inds = NMSop.apply(boxes, scores, iou_threshold, offset,
                           score_threshold, max_num)

    kept_scores = scores[inds].reshape(-1, 1)
    dets = torch.cat((boxes[inds], kept_scores), dim=1)
    if not is_numpy:
        return dets, inds
    return dets.cpu().numpy(), inds.cpu().numpy()
+ sigma (float): hyperparameter for gaussian method + min_score (float): score filter threshold + method (str): either 'linear' or 'gaussian' + offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). + + Returns: + tuple: kept dets(boxes and scores) and indice, which is always the \ + same data type as the input. + + Example: + >>> boxes = np.array([[4., 3., 5., 3.], + >>> [4., 3., 5., 4.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.]], dtype=np.float32) + >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32) + >>> iou_threshold = 0.6 + >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5) + >>> assert len(inds) == len(dets) == 5 + """ + + assert isinstance(boxes, (torch.Tensor, np.ndarray)) + assert isinstance(scores, (torch.Tensor, np.ndarray)) + is_numpy = False + if isinstance(boxes, np.ndarray): + is_numpy = True + boxes = torch.from_numpy(boxes) + if isinstance(scores, np.ndarray): + scores = torch.from_numpy(scores) + assert boxes.size(1) == 4 + assert boxes.size(0) == scores.size(0) + assert offset in (0, 1) + method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2} + assert method in method_dict.keys() + + if torch.__version__ == 'parrots': + dets = boxes.new_empty((boxes.size(0), 5), device='cpu') + indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()] + indata_dict = { + 'iou_threshold': float(iou_threshold), + 'sigma': float(sigma), + 'min_score': min_score, + 'method': method_dict[method], + 'offset': int(offset) + } + inds = ext_module.softnms(*indata_list, **indata_dict) + else: + dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(), + float(iou_threshold), float(sigma), + float(min_score), method_dict[method], + int(offset)) + + dets = dets[:inds.size(0)] + + if is_numpy: + dets = dets.cpu().numpy() + inds = inds.cpu().numpy() + return dets, inds + else: + return dets.to(device=boxes.device), inds.to(device=boxes.device) + + +def batched_nms(boxes, scores, 
idxs, nms_cfg, class_agnostic=False): + """Performs non-maximum suppression in a batched fashion. + + Modified from https://github.com/pytorch/vision/blob + /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. + In order to perform NMS independently per class, we add an offset to all + the boxes. The offset is dependent only on the class idx, and is large + enough so that boxes from different classes do not overlap. + + Arguments: + boxes (torch.Tensor): boxes in shape (N, 4). + scores (torch.Tensor): scores in shape (N, ). + idxs (torch.Tensor): each index value correspond to a bbox cluster, + and NMS will not be applied between elements of different idxs, + shape (N, ). + nms_cfg (dict): specify nms type and other parameters like iou_thr. + Possible keys includes the following. + + - iou_thr (float): IoU threshold used for NMS. + - split_thr (float): threshold number of boxes. In some cases the + number of boxes is large (e.g., 200k). To avoid OOM during + training, the users could set `split_thr` to a small value. + If the number of boxes is greater than the threshold, it will + perform NMS on each group of boxes separately and sequentially. + Defaults to 10000. + class_agnostic (bool): if true, nms is class agnostic, + i.e. IoU thresholding happens over all boxes, + regardless of the predicted class. + + Returns: + tuple: kept dets and indice. 
def nms_match(dets, iou_threshold):
    """Group detections by NMS, keeping track of what each kept box
    suppressed.

    NMS match is similar to NMS, but when a bbox is suppressed its index is
    recorded in a group together with the index of the bbox that was kept.
    Within each group the indices are sorted in score order.

    Arguments:
        dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5).
        iou_threshold (float): IoU thresh for NMS.

    Returns:
        List[torch.Tensor | np.ndarray]: The outer list corresponds to the
            different matched groups, the inner Tensor/array holds the
            indices of one group in score order. The container type follows
            the type of ``dets``; an empty input yields an empty list.
    """
    if dets.shape[0] == 0:
        # Nothing to match; skip the extension call entirely.
        matched = []
    else:
        assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \
            f'but get {dets.shape}'
        # The extension is always fed a CPU tensor.
        if isinstance(dets, torch.Tensor):
            dets_t = dets.detach().cpu()
        else:
            dets_t = torch.from_numpy(dets)
        indata_list = [dets_t]
        indata_dict = {'iou_threshold': float(iou_threshold)}
        matched = ext_module.nms_match(*indata_list, **indata_dict)
        if torch.__version__ == 'parrots':
            matched = matched.tolist()

    if isinstance(dets, torch.Tensor):
        return [dets.new_tensor(m, dtype=torch.long) for m in matched]
    # `np.int` was removed in NumPy 1.24; use an explicit 64-bit integer,
    # which also mirrors the `torch.long` used on the tensor path.
    return [np.array(m, dtype=np.int64) for m in matched]
def pixel_group(score, mask, embedding, kernel_label, kernel_contour,
                kernel_region_num, distance_threshold):
    """Group pixels into text instances, which is widely used in text
    detection methods.

    Numpy inputs are converted to torch tensors before the compiled
    ``pixel_group`` extension op is called.

    Arguments:
        score (np.array or Tensor): The foreground score with size hxw.
        mask (np.array or Tensor): The foreground mask with size hxw.
        embedding (np.array or Tensor): The embedding with size hxwxc to
            distinguish instances.
        kernel_label (np.array or Tensor): The instance kernel index with
            size hxw.
        kernel_contour (np.array or Tensor): The kernel contour with size hxw.
        kernel_region_num (int): The instance kernel region number.
        distance_threshold (float): The embedding distance threshold between
            kernel and pixel in one instance. Must be a ``float`` instance;
            an ``int`` is rejected by the type assertion.

    Returns:
        pixel_assignment (List[List[float]]): The instance coordinate list.
            Each element consists of averaged confidence, pixel number, and
            coordinates (x_i, y_i for all pixels) in order.
    """
    assert isinstance(score, (torch.Tensor, np.ndarray))
    assert isinstance(mask, (torch.Tensor, np.ndarray))
    assert isinstance(embedding, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_label, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_contour, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_region_num, int)
    assert isinstance(distance_threshold, float)

    if isinstance(score, np.ndarray):
        score = torch.from_numpy(score)
    if isinstance(mask, np.ndarray):
        mask = torch.from_numpy(mask)
    if isinstance(embedding, np.ndarray):
        embedding = torch.from_numpy(embedding)
    if isinstance(kernel_label, np.ndarray):
        kernel_label = torch.from_numpy(kernel_label)
    if isinstance(kernel_contour, np.ndarray):
        kernel_contour = torch.from_numpy(kernel_contour)

    if torch.__version__ == 'parrots':
        label = ext_module.pixel_group(
            score,
            mask,
            embedding,
            kernel_label,
            kernel_contour,
            kernel_region_num=kernel_region_num,
            distance_threshold=distance_threshold)
        label = label.tolist()
        label = label[0]
        # The first `kernel_region_num` entries are read as per-region
        # lengths; the region payloads follow back to back after them.
        list_index = kernel_region_num
        pixel_assignment = []
        for x in range(kernel_region_num):
            region_len = int(label[x])
            # `np.float` was removed in NumPy 1.24; it was an alias of the
            # builtin float, i.e. float64.
            pixel_assignment.append(
                np.array(
                    label[list_index:list_index + region_len],
                    dtype=np.float64))
            list_index = list_index + region_len
    else:
        pixel_assignment = ext_module.pixel_group(score, mask, embedding,
                                                  kernel_label, kernel_contour,
                                                  kernel_region_num,
                                                  distance_threshold)
    return pixel_assignment
def bilinear_grid_sample(im, grid, align_corners=False):
    """Sample ``im`` at the locations in ``grid`` with bilinear interpolation.

    Pure-PyTorch counterpart of ``grid_sample`` restricted to bilinear
    interpolation with zero padding outside the image.

    Args:
        im (torch.Tensor): Input feature map, shape (N, C, H, W)
        grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)
        align_corners {bool}: If set to True, the extrema (-1 and 1) are
            considered as referring to the center points of the input's
            corner pixels. If set to False, they are instead considered as
            referring to the corner points of the input's corner pixels,
            making the sampling more resolution agnostic.

    Returns:
        torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
    """
    batch, channels, height, width = im.shape
    grid_batch, out_h, out_w, _ = grid.shape
    assert batch == grid_batch

    # Map normalized [-1, 1] coordinates to (fractional) pixel coordinates.
    px = grid[:, :, :, 0]
    py = grid[:, :, :, 1]
    if align_corners:
        px = ((px + 1) / 2) * (width - 1)
        py = ((py + 1) / 2) * (height - 1)
    else:
        px = ((px + 1) * width - 1) / 2
        py = ((py + 1) * height - 1) / 2
    px = px.view(batch, -1)
    py = py.view(batch, -1)

    # Integer corners surrounding every sampling location.
    left = torch.floor(px).long()
    top = torch.floor(py).long()
    right = left + 1
    bottom = top + 1

    # Bilinear weights, computed from the unshifted corner coordinates.
    w_left_top = ((right - px) * (bottom - py)).unsqueeze(1)
    w_left_bottom = ((right - px) * (py - top)).unsqueeze(1)
    w_right_top = ((px - left) * (bottom - py)).unsqueeze(1)
    w_right_bottom = ((px - left) * (py - top)).unsqueeze(1)

    # A one-pixel zero border reproduces grid_sample's default zero padding.
    padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0)
    padded_h = height + 2
    padded_w = width + 2

    def _shift_and_clip(index, upper):
        # Move into padded coordinates, then clip to [0, upper].
        index = index + 1
        index = torch.where(index < 0, torch.tensor(0), index)
        return torch.where(index > upper, torch.tensor(upper), index)

    left = _shift_and_clip(left, padded_w - 1)
    right = _shift_and_clip(right, padded_w - 1)
    top = _shift_and_clip(top, padded_h - 1)
    bottom = _shift_and_clip(bottom, padded_h - 1)

    flat = padded.view(batch, channels, -1)

    def _gather(col, row):
        # Look up pixel (row, col) in the flattened padded image.
        linear = (col + row * padded_w).unsqueeze(1).expand(-1, channels, -1)
        return torch.gather(flat, 2, linear)

    v_left_top = _gather(left, top)
    v_left_bottom = _gather(left, bottom)
    v_right_top = _gather(right, top)
    v_right_bottom = _gather(right, bottom)

    blended = (v_left_top * w_left_top + v_left_bottom * w_left_bottom +
               v_right_top * w_right_top + v_right_bottom * w_right_bottom)
    return blended.reshape(batch, channels, out_h, out_w)
def generate_grid(num_grid, size, device):
    """Build a regular grid of points covering the unit square
    [0, 1] x [0, 1], repeated for every region.

    Args:
        num_grid (int): The number of grids to sample, one for each region.
        size (tuple(int, int)): The side size of the regular grid.
        device (torch.device): Desired device of returned tensor.

    Returns:
        (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that
            contains coordinates for the regular grids.
    """
    # An identity affine transform yields the canonical grid in [-1, 1].
    identity_theta = torch.tensor(
        [[[1., 0., 0.], [0., 1., 0.]]], device=device)
    canonical = F.affine_grid(
        identity_theta, torch.Size((1, 1, *size)), align_corners=False)
    # Rescale from [-1, 1] to [0, 1] (the same mapping as `normalize`).
    unit_grid = (canonical + 1.0) / 2.0
    # `expand` shares one grid across all regions without copying it.
    return unit_grid.view(1, -1, 2).expand(num_grid, -1, -1)
def get_shape_from_feature_map(x):
    """Return the spatial resolution of a feature map as a tensor, in a way
    that also works while exporting to onnx.

    Args:
        x (torch.Tensor): Input tensor, shape (N, C, H, W)

    Returns:
        torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2)
    """
    if torch.onnx.is_in_onnx_export():
        # Read the shape through an operator so it is recorded in the graph.
        spatial = shape_as_tensor(x)[2:]
    else:
        spatial = torch.tensor(x.shape[2:])
    # (H, W) -> (W, H), shaped so it broadcasts against (N, P, 2) points.
    return spatial.flip(0).view(1, 1, 2).to(x.device).float()
def rel_roi_point_to_rel_img_point(rois,
                                   rel_roi_points,
                                   img,
                                   spatial_scale=1.):
    """Convert roi based relative point coordinates to image based relative
    point coordinates.

    The points are first mapped to absolute image coordinates and then
    rescaled relative to the image (or feature map) size.

    Args:
        rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (Tensor): Point coordinates inside RoI, relative to
            RoI, location, range (0, 1), shape (N, P, 2)
        img (tuple/Tensor): (height, width) of image or feature map.
        spatial_scale (float): Scale points by this factor. Default: 1.

    Returns:
        Tensor: Image based relative point coordinates for sampling,
            shape (N, P, 2)
    """
    return abs_img_point_to_rel_img_point(
        rel_roi_point_to_abs_img_point(rois, rel_roi_points), img,
        spatial_scale)
class SimpleRoIAlign(nn.Module):

    def __init__(self, output_size, spatial_scale, aligned=True):
        """Simple RoI align in PointRend, faster than standard RoIAlign.

        Args:
            output_size (tuple[int]): h, w
            spatial_scale (float): scale the input boxes by this number
            aligned (bool): if False, use the legacy implementation in
                MMDetection, align_corners=True will be used in F.grid_sample.
                If True, align the results more perfectly.
        """

        super(SimpleRoIAlign, self).__init__()
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        # to be consistent with other RoI ops
        self.use_torchvision = False
        self.aligned = aligned

    def forward(self, features, rois):
        """Sample a regular grid of point features inside every RoI.

        Args:
            features (Tensor): Feature map batch; dim 0 indexes images and
                dim 1 channels.
            rois (Tensor): RoIs whose first column is compared against the
                image index to pick the feature map each RoI is sampled from.

        Returns:
            Tensor: RoI features of shape
                (num_rois, channels, *self.output_size).

        Note:
            Outside of ONNX export the per-image results are concatenated in
            image order, so rows line up with ``rois`` only when the RoIs are
            grouped by ascending image index.
        """
        num_imgs = features.size(0)
        num_rois = rois.size(0)
        channels = features.size(1)

        if num_rois == 0 and not torch.onnx.is_in_onnx_export():
            # No RoIs: return an empty result instead of failing on
            # torch.cat([]) below.
            return features.new_zeros((0, channels, *self.output_size))

        rel_roi_points = generate_grid(
            num_rois, self.output_size, device=rois.device)

        if torch.onnx.is_in_onnx_export():
            rel_img_points = rel_roi_point_to_rel_img_point(
                rois, rel_roi_points, features, self.spatial_scale)
            rel_img_points = rel_img_points.reshape(num_imgs, -1,
                                                    *rel_img_points.shape[1:])
            point_feats = point_sample(
                features, rel_img_points, align_corners=not self.aligned)
            point_feats = point_feats.transpose(1, 2)
        else:
            point_feats = []
            for batch_ind in range(num_imgs):
                # unravel batch dim
                feat = features[batch_ind].unsqueeze(0)
                inds = (rois[:, 0].long() == batch_ind)
                if inds.any():
                    rel_img_points = rel_roi_point_to_rel_img_point(
                        rois[inds], rel_roi_points[inds], feat,
                        self.spatial_scale).unsqueeze(0)
                    point_feat = point_sample(
                        feat, rel_img_points, align_corners=not self.aligned)
                    point_feat = point_feat.squeeze(0).transpose(0, 1)
                    point_feats.append(point_feat)

            point_feats = torch.cat(point_feats, dim=0)

        roi_feats = point_feats.reshape(num_rois, channels, *self.output_size)

        return roi_feats

    def __repr__(self):
        # Balanced parentheses, and every constructor argument is reported.
        return (f'{self.__class__.__name__}('
                f'output_size={self.output_size}, '
                f'spatial_scale={self.spatial_scale}, '
                f'aligned={self.aligned})')
+ + Args: + points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate + boxes (torch.Tensor): [B, T, 7], + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in + LiDAR/DEPTH coordinate, (x, y, z) is the bottom center + + Returns: + box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 + """ + assert points.shape[0] == boxes.shape[0], \ + 'Points and boxes should have the same batch size, ' \ + f'but got {points.shape[0]} and {boxes.shape[0]}' + assert boxes.shape[2] == 7, \ + 'boxes dimension should be 7, ' \ + f'but got unexpected shape {boxes.shape[2]}' + assert points.shape[2] == 3, \ + 'points dimension should be 3, ' \ + f'but got unexpected shape {points.shape[2]}' + batch_size, num_points, _ = points.shape + + box_idxs_of_pts = points.new_zeros((batch_size, num_points), + dtype=torch.int).fill_(-1) + + # If manually put the tensor 'points' or 'boxes' on a device + # which is not the current device, some temporary variables + # will be created on the current device in the cuda op, + # and the output will be incorrect. + # Therefore, we force the current device to be the same + # as the device of the tensors if it was not. + # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305 + # for the incorrect output before the fix. + points_device = points.get_device() + assert points_device == boxes.get_device(), \ + 'Points and boxes should be put on the same device' + if torch.cuda.current_device() != points_device: + torch.cuda.set_device(points_device) + + ext_module.points_in_boxes_part_forward(boxes.contiguous(), + points.contiguous(), + box_idxs_of_pts) + + return box_idxs_of_pts + + +def points_in_boxes_cpu(points, boxes): + """Find all boxes in which each point is (CPU). The CPU version of + :meth:`points_in_boxes_all`. 
def points_in_boxes_all(points, boxes):
    """Find all boxes in which each point is (CUDA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
    assert boxes.shape[0] == points.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    # Output buffer filled in place by the extension op; zero = background.
    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
                                       dtype=torch.int)

    # If the tensors live on a device other than the current one, the cuda op
    # creates temporaries on the current device and the output is incorrect.
    # Force the current device to match the tensors' device.
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

    ext_module.points_in_boxes_all_forward(boxes.contiguous(),
                                           points.contiguous(),
                                           box_idxs_of_pts)

    return box_idxs_of_pts
class PointsSampler(nn.Module):
    """Multi-stage points sampling.

    Each stage applies one sampler to a slice of the input points.

    Args:
        num_point (list[int]): Number of sample points.
        fps_mod_list (list[str], optional): Type of FPS method, valid mod
            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
            F-FPS: using feature distances for FPS.
            D-FPS: using Euclidean distances of points for FPS.
            FS: using F-FPS and D-FPS simultaneously.
        fps_sample_range_list (list[int], optional):
            Range of points to apply FPS. Default: [-1].
    """

    def __init__(self,
                 num_point: List[int],
                 fps_mod_list: List[str] = ['D-FPS'],
                 fps_sample_range_list: List[int] = [-1]):
        super().__init__()
        # One sampler per stage: the three lists must have equal length.
        assert len(num_point) == len(fps_mod_list) == len(
            fps_sample_range_list)
        self.num_point = num_point
        self.fps_sample_range_list = fps_sample_range_list
        self.samplers = nn.ModuleList(
            [get_sampler_cls(mod)() for mod in fps_mod_list])
        self.fp16_enabled = False

    @force_fp32()
    def forward(self, points_xyz, features):
        """
        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            features (Tensor): (B, C, N) Descriptors of the features.

        Returns:
            Tensor: Indices of sampled points; the per-stage results are
                concatenated along dim 1.
        """
        per_stage = []
        start = 0
        stages = zip(self.fps_sample_range_list, self.samplers,
                     self.num_point)

        for range_end, sampler, npoint in stages:
            assert range_end < points_xyz.shape[1]

            # -1 means "up to the last point", i.e. an open-ended slice.
            stop = None if range_end == -1 else range_end
            xyz_slice = points_xyz[:, start:stop]
            feat_slice = (None if features is None else
                          features[:, :, start:stop])

            stage_idx = sampler(xyz_slice.contiguous(), feat_slice, npoint)

            # Shift slice-local indices back into the full point set.
            per_stage.append(stage_idx + start)
            # NOTE(review): the offset advances by the range value itself
            # (including -1); confirm this is intended for >2 stages.
            start += range_end

        return torch.cat(per_stage, dim=1)
class FSSampler(nn.Module):
    """Sampler that applies F-FPS and D-FPS to the same input and joins them."""

    def __init__(self):
        super().__init__()

    def forward(self, points, features, npoint):
        """Return the F-FPS indices followed by the D-FPS indices.

        Both samplers receive the same ``points``, ``features`` and
        ``npoint``; their results are concatenated along dim 1.

        Raises:
            AssertionError: If ``features`` is None.
        """
        assert features is not None, \
            'feature input to FS_Sampler should not be None'
        by_feature = FFPSSampler()(points, features, npoint)
        by_distance = DFPSSampler()(points, features, npoint)
        return torch.cat([by_feature, by_distance], dim=1)
class PSAMask(nn.Module):
    """Module wrapper around ``psa_mask``.

    Args:
        psa_type (str): Either ``'collect'`` or ``'distribute'``. Stored
            as-is and also as an integer code (0 for ``'collect'``, 1 for
            ``'distribute'``) that is what gets passed to ``psa_mask``.
        mask_size: Forwarded unchanged to ``psa_mask``. Default: None.
    """

    def __init__(self, psa_type, mask_size=None):
        super().__init__()
        assert psa_type in ('collect', 'distribute')
        self.psa_type = psa_type
        self.psa_type_enum = 0 if psa_type == 'collect' else 1
        self.mask_size = mask_size

    def forward(self, input):
        """Apply ``psa_mask`` to ``input`` with the stored configuration."""
        return psa_mask(input, self.psa_type_enum, self.mask_size)

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(psa_type={self.psa_type}, '
                f'mask_size={self.mask_size})')
+import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import deprecated_api_warning, ext_loader + +ext_module = ext_loader.load_ext('_ext', + ['roi_align_forward', 'roi_align_backward']) + + +class RoIAlignFunction(Function): + + @staticmethod + def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio, + pool_mode, aligned): + from ..onnx import is_custom_op_loaded + has_custom_op = is_custom_op_loaded() + if has_custom_op: + return g.op( + 'mmcv::MMCVRoiAlign', + input, + rois, + output_height_i=output_size[0], + output_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_i=sampling_ratio, + mode_s=pool_mode, + aligned_i=aligned) + else: + from torch.onnx.symbolic_opset9 import sub, squeeze + from torch.onnx.symbolic_helper import _slice_helper + from torch.onnx import TensorProtoDataType + # batch_indices = rois[:, 0].long() + batch_indices = _slice_helper( + g, rois, axes=[1], starts=[0], ends=[1]) + batch_indices = squeeze(g, batch_indices, 1) + batch_indices = g.op( + 'Cast', batch_indices, to_i=TensorProtoDataType.INT64) + # rois = rois[:, 1:] + rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5]) + if aligned: + # rois -= 0.5/spatial_scale + aligned_offset = g.op( + 'Constant', + value_t=torch.tensor([0.5 / spatial_scale], + dtype=torch.float32)) + rois = sub(g, rois, aligned_offset) + # roi align + return g.op( + 'RoiAlign', + input, + rois, + batch_indices, + output_height_i=output_size[0], + output_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_i=max(0, sampling_ratio), + mode_s=pool_mode) + + @staticmethod + def forward(ctx, + input, + rois, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + pool_mode='avg', + aligned=True): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + assert 
class RoIAlign(nn.Module):
    """RoI align pooling layer.

    Args:
        output_size (tuple): h, w
        spatial_scale (float): scale the input boxes by this number
        sampling_ratio (int): number of inputs samples to take for each
            output sample. 0 to take samples densely for current models.
        pool_mode (str, 'avg' or 'max'): pooling mode in each bin.
        aligned (bool): if False, use the legacy implementation in
            MMDetection. If True, align the results more perfectly.
        use_torchvision (bool): whether to use roi_align from torchvision.

    Note:
        The implementation of RoIAlign when aligned=True is modified from
        https://github.com/facebookresearch/detectron2/

        The meaning of aligned=True:

        Given a continuous coordinate c, its two neighboring pixel
        indices (in our pixel model) are computed by floor(c - 0.5) and
        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
        indices [0] and [1] (which are sampled from the underlying signal
        at continuous coordinates 0.5 and 1.5). But the original roi_align
        (aligned=False) does not subtract the 0.5 when computing
        neighboring pixel indices and therefore it uses pixels with a
        slightly incorrect alignment (relative to our pixel model) when
        performing bilinear interpolation.

        With `aligned=True`,
        we first appropriately scale the ROI and then shift it by -0.5
        prior to calling roi_align. This produces the correct neighbors;

        The difference does not make a difference to the model's
        performance if ROIAlign is used together with conv layers.
    """

    @deprecated_api_warning(
        {
            'out_size': 'output_size',
            'sample_num': 'sampling_ratio'
        },
        cls_name='RoIAlign')
    def __init__(self,
                 output_size,
                 spatial_scale=1.0,
                 sampling_ratio=0,
                 pool_mode='avg',
                 aligned=True,
                 use_torchvision=False):
        super(RoIAlign, self).__init__()

        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)
        self.pool_mode = pool_mode
        self.aligned = aligned
        self.use_torchvision = use_torchvision

    def forward(self, input, rois):
        """Pool ``input`` over ``rois``.

        Args:
            input: NCHW images
            rois: Bx5 boxes. First column is the index into N.
                The other 4 columns are xyxy. The tensor is never modified.
        """
        if self.use_torchvision:
            from torchvision.ops import roi_align as tv_roi_align
            if 'aligned' in tv_roi_align.__code__.co_varnames:
                return tv_roi_align(input, rois, self.output_size,
                                    self.spatial_scale, self.sampling_ratio,
                                    self.aligned)
            else:
                if self.aligned:
                    # Emulate ``aligned=True`` for a torchvision that lacks
                    # the argument by shifting the four box coordinates.
                    # Done out of place so the caller's ``rois`` tensor is
                    # not shifted again on every call.
                    rois = rois - rois.new_tensor(
                        [0.] + [0.5 / self.spatial_scale] * 4)
                return tv_roi_align(input, rois, self.output_size,
                                    self.spatial_scale, self.sampling_ratio)
        else:
            return roi_align(input, rois, self.output_size, self.spatial_scale,
                             self.sampling_ratio, self.pool_mode, self.aligned)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(output_size={self.output_size}, '
        s += f'spatial_scale={self.spatial_scale}, '
        s += f'sampling_ratio={self.sampling_ratio}, '
        s += f'pool_mode={self.pool_mode}, '
        s += f'aligned={self.aligned}, '
        s += f'use_torchvision={self.use_torchvision})'
        return s
def _unpack_out_size(out_size):
    """Return ``(out_h, out_w)`` from an int or a 2-tuple of ints."""
    if isinstance(out_size, int):
        return out_size, out_size
    if isinstance(out_size, tuple):
        assert len(out_size) == 2
        assert isinstance(out_size[0], int)
        assert isinstance(out_size[1], int)
        return out_size
    raise TypeError('"out_size" must be an integer or tuple of integers')


class RoIAlignRotatedFunction(Function):
    """Autograd function wrapping the compiled rotated RoI-align kernels."""

    @staticmethod
    def symbolic(g, features, rois, out_size, spatial_scale, sample_num,
                 aligned, clockwise):
        """Emit the ``mmcv::MMCVRoIAlignRotated`` node for ONNX export.

        Raises:
            TypeError: If ``out_size`` is neither an int nor a tuple.
        """
        out_h, out_w = _unpack_out_size(out_size)
        return g.op(
            'mmcv::MMCVRoIAlignRotated',
            features,
            rois,
            output_height_i=out_h,
            # Width must come from ``out_w``; exporting the height here
            # breaks every non-square output size.
            output_width_i=out_w,
            spatial_scale_f=spatial_scale,
            sampling_ratio_i=sample_num,
            aligned_i=aligned,
            clockwise_i=clockwise)

    @staticmethod
    def forward(ctx,
                features,
                rois,
                out_size,
                spatial_scale,
                sample_num=0,
                aligned=True,
                clockwise=False):
        """Pool ``features`` over ``rois`` into ``(out_h, out_w)`` bins.

        Allocates a zero tensor of shape
        ``(rois.size(0), C, out_h, out_w)`` and hands it to the extension
        kernel to fill.

        Raises:
            TypeError: If ``out_size`` is neither an int nor a tuple.
        """
        out_h, out_w = _unpack_out_size(out_size)
        ctx.spatial_scale = spatial_scale
        ctx.sample_num = sample_num
        ctx.aligned = aligned
        ctx.clockwise = clockwise
        ctx.save_for_backward(rois)
        ctx.feature_size = features.size()

        # Unpacking also enforces that ``features`` is 4-D.
        _, num_channels, _, _ = features.size()
        num_rois = rois.size(0)

        output = features.new_zeros(num_rois, num_channels, out_h, out_w)
        ext_module.roi_align_rotated_forward(
            features,
            rois,
            output,
            pooled_height=out_h,
            pooled_width=out_w,
            spatial_scale=spatial_scale,
            sample_num=sample_num,
            aligned=aligned,
            clockwise=clockwise)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``features``; all other slots are None.

        The feature gradient is only computed when autograd reports that the
        first forward input needs it.
        """
        feature_size = ctx.feature_size
        spatial_scale = ctx.spatial_scale
        aligned = ctx.aligned
        clockwise = ctx.clockwise
        sample_num = ctx.sample_num
        rois = ctx.saved_tensors[0]
        assert feature_size is not None
        batch_size, num_channels, data_height, data_width = feature_size

        out_w = grad_output.size(3)
        out_h = grad_output.size(2)

        grad_input = grad_rois = None

        if ctx.needs_input_grad[0]:
            grad_input = rois.new_zeros(batch_size, num_channels, data_height,
                                        data_width)
            ext_module.roi_align_rotated_backward(
                grad_output.contiguous(),
                rois,
                grad_input,
                pooled_height=out_h,
                pooled_width=out_w,
                spatial_scale=spatial_scale,
                sample_num=sample_num,
                aligned=aligned,
                clockwise=clockwise)
        return grad_input, grad_rois, None, None, None, None, None
For example, c=1.3 has pixel neighbors with discrete + indices [0] and [1] (which are sampled from the underlying signal + at continuous coordinates 0.5 and 1.5). But the original roi_align + (aligned=False) does not subtract the 0.5 when computing + neighboring pixel indices and therefore it uses pixels with a + slightly incorrect alignment (relative to our pixel model) when + performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; + + The difference does not make a difference to the model's + performance if ROIAlign is used together with conv layers. + """ + + def __init__(self, + out_size, + spatial_scale, + sample_num=0, + aligned=True, + clockwise=False): + super(RoIAlignRotated, self).__init__() + + self.out_size = out_size + self.spatial_scale = float(spatial_scale) + self.sample_num = int(sample_num) + self.aligned = aligned + self.clockwise = clockwise + + def forward(self, features, rois): + return RoIAlignRotatedFunction.apply(features, rois, self.out_size, + self.spatial_scale, + self.sample_num, self.aligned, + self.clockwise) diff --git a/lavis/common/annotator/uniformer/mmcv/ops/roi_pool.py b/lavis/common/annotator/uniformer/mmcv/ops/roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..d339d8f2941eabc1cbe181a9c6c5ab5ff4ff4e5f --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/ops/roi_pool.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
class RoIPoolFunction(Function):
    """Autograd function wrapping the compiled RoI pooling kernels."""

    @staticmethod
    def symbolic(g, input, rois, output_size, spatial_scale):
        """Emit a ``MaxRoiPool`` node for ONNX export."""
        onnx_attrs = dict(
            pooled_shape_i=output_size, spatial_scale_f=spatial_scale)
        return g.op('MaxRoiPool', input, rois, **onnx_attrs)

    @staticmethod
    def forward(ctx, input, rois, output_size, spatial_scale=1.0):
        """Pool ``input`` over ``rois``.

        Allocates the output and an int ``argmax`` buffer of shape
        ``(rois.size(0), input.size(1), h, w)`` and lets the extension
        kernel fill both; ``rois`` and ``argmax`` are kept for backward.

        Raises:
            AssertionError: If ``rois`` does not have 5 columns.
        """
        pooled_size = _pair(output_size)
        ctx.output_size = pooled_size
        ctx.spatial_scale = spatial_scale
        ctx.input_shape = input.size()

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        result_shape = (rois.size(0), input.size(1), pooled_size[0],
                        pooled_size[1])
        pooled = input.new_zeros(result_shape)
        argmax = input.new_zeros(result_shape, dtype=torch.int)

        ext_module.roi_pool_forward(
            input,
            rois,
            pooled,
            argmax,
            pooled_height=pooled_size[0],
            pooled_width=pooled_size[1],
            spatial_scale=spatial_scale)

        ctx.save_for_backward(rois, argmax)
        return pooled

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input``; other slots are None."""
        rois, argmax = ctx.saved_tensors
        pooled_h, pooled_w = ctx.output_size[0], ctx.output_size[1]
        grad_input = grad_output.new_zeros(ctx.input_shape)

        ext_module.roi_pool_backward(
            grad_output,
            rois,
            argmax,
            grad_input,
            pooled_height=pooled_h,
            pooled_width=pooled_w,
            spatial_scale=ctx.spatial_scale)

        return grad_input, None, None, None
class RoIAwarePool3d(nn.Module):
    """Encode the geometry-specific features of each 3D proposal.

    This is the RoI-aware pooling layer attributed to PartA2 in the
    upstream documentation.

    Args:
        out_size (int or tuple): The size of output features. n or
            [n1, n2, n3].
        max_pts_per_voxel (int, optional): The maximum number of points per
            voxel. Default: 128.
        mode (str, optional): Pooling method of RoIAware, 'max' or 'avg'.
            Default: 'max'. Stored on the module as an integer code
            (0 for 'max', 1 for 'avg').
    """

    def __init__(self, out_size, max_pts_per_voxel=128, mode='max'):
        super().__init__()

        assert mode in ('max', 'avg')
        mode_codes = {'max': 0, 'avg': 1}

        self.out_size = out_size
        self.max_pts_per_voxel = max_pts_per_voxel
        self.mode = mode_codes[mode]

    def forward(self, rois, pts, pts_feature):
        """Pool point features into the voxel grid of each RoI.

        Args:
            rois (torch.Tensor): [N, 7], in LiDAR coordinate,
                (x, y, z) is the bottom center of rois.
            pts (torch.Tensor): [npoints, 3], coordinates of input points.
            pts_feature (torch.Tensor): [npoints, C], features of input
                points.

        Returns:
            pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
        """
        pooling_args = (rois, pts, pts_feature, self.out_size,
                        self.max_pts_per_voxel, self.mode)
        return RoIAwarePool3dFunction.apply(*pooling_args)
class RoIPointPool3d(nn.Module):
    """Encode the geometry-specific features of each 3D proposal.

    The upstream documentation attributes this layer to the PartA2 paper.

    Args:
        num_sampled_points (int, optional): Number of samples in each roi.
            Default: 512.
    """

    def __init__(self, num_sampled_points=512):
        super().__init__()
        self.num_sampled_points = num_sampled_points

    def forward(self, points, point_features, boxes3d):
        """Pool points and their features inside every box.

        Args:
            points (torch.Tensor): Input points whose shape is (B, N, C).
            point_features (torch.Tensor): Features of input points whose
                shape is (B, N, C).
            boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7).

        Returns:
            pooled_features (torch.Tensor): The output pooled features whose
                shape is (B, M, 512, 3 + C).
            pooled_empty_flag (torch.Tensor): Empty flag whose shape is
                (B, M).
        """
        sample_count = self.num_sampled_points
        return RoIPointPool3dFunction.apply(points, point_features, boxes3d,
                                            sample_count)
@CONV_LAYERS.register_module(name='SAC')
class SAConv2d(ConvAWS2d):
    """SAC (Switchable Atrous Convolution)

    This is an implementation of SAC in DetectoRS
    (https://arxiv.org/pdf/2006.02334.pdf).

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If ``True``, adds a learnable bias to the
            output. Default: ``True``
        use_deform: If ``True``, replace convolution with deformable
            convolution. Default: ``False``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 use_deform=False):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        self.use_deform = use_deform
        self.switch = nn.Conv2d(
            self.in_channels, 1, kernel_size=1, stride=stride, bias=True)
        # Allocated uninitialised here; zeroed in init_weights() below.
        self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size()))
        self.pre_context = nn.Conv2d(
            self.in_channels, self.in_channels, kernel_size=1, bias=True)
        self.post_context = nn.Conv2d(
            self.out_channels, self.out_channels, kernel_size=1, bias=True)
        if self.use_deform:
            self.offset_s = nn.Conv2d(
                self.in_channels,
                18,
                kernel_size=3,
                padding=1,
                stride=stride,
                bias=True)
            self.offset_l = nn.Conv2d(
                self.in_channels,
                18,
                kernel_size=3,
                padding=1,
                stride=stride,
                bias=True)
        self.init_weights()

    def init_weights(self):
        """Reset the SAC-specific layers to their constant initial values."""
        constant_init(self.switch, 0, bias=1)
        self.weight_diff.data.zero_()
        constant_init(self.pre_context, 0)
        constant_init(self.post_context, 0)
        if self.use_deform:
            constant_init(self.offset_s, 0)
            constant_init(self.offset_l, 0)

    def _conv_without_bias(self, x, weight, zero_bias):
        """Run the parent's plain convolution with ``weight`` and no bias.

        Picks the parent entry point that matches the running torch version.
        """
        if (TORCH_VERSION == 'parrots'
                or digit_version(TORCH_VERSION) < digit_version('1.5.0')):
            return super().conv2d_forward(x, weight)
        if digit_version(TORCH_VERSION) >= digit_version('1.8.0'):
            # bias is a required argument of _conv_forward in torch 1.8.0
            return super()._conv_forward(x, weight, zero_bias)
        return super()._conv_forward(x, weight)

    def forward(self, x):
        """Blend a small-field and a large-field convolution of ``x``.

        The large-field branch reuses this module's convolution with padding
        and dilation multiplied by 3 and with ``weight_diff`` added to the
        weight. A per-position switch computed from ``x`` mixes both
        branches, and a global-context term is added before and after.
        """
        # pre-context
        avg_x = F.adaptive_avg_pool2d(x, output_size=1)
        avg_x = self.pre_context(avg_x)
        avg_x = avg_x.expand_as(x)
        x = x + avg_x
        # switch
        avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect')
        avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0)
        switch = self.switch(avg_x)
        # sac
        weight = self._get_weight(self.weight)
        zero_bias = torch.zeros(
            self.out_channels, device=weight.device, dtype=weight.dtype)

        if self.use_deform:
            offset = self.offset_s(avg_x)
            out_s = deform_conv2d(x, offset, weight, self.stride, self.padding,
                                  self.dilation, self.groups, 1)
        else:
            out_s = self._conv_without_bias(x, weight, zero_bias)

        ori_p = self.padding
        ori_d = self.dilation
        self.padding = tuple(3 * p for p in self.padding)
        self.dilation = tuple(3 * d for d in self.dilation)
        try:
            weight = weight + self.weight_diff
            if self.use_deform:
                offset = self.offset_l(avg_x)
                out_l = deform_conv2d(x, offset, weight, self.stride,
                                      self.padding, self.dilation,
                                      self.groups, 1)
            else:
                out_l = self._conv_without_bias(x, weight, zero_bias)
        finally:
            # Always undo the temporary x3 padding/dilation, even when the
            # large-field convolution raises; otherwise the module would
            # keep the enlarged geometry for every later call.
            self.padding = ori_p
            self.dilation = ori_d

        out = switch * out_s + (1 - switch) * out_l
        # post-context
        avg_x = F.adaptive_avg_pool2d(out, output_size=1)
        avg_x = self.post_context(avg_x)
        avg_x = avg_x.expand_as(out)
        out = out + avg_x
        return out
class DynamicScatter(nn.Module):
    """Scatter points into voxels for dynamic voxelization.

    Points that share a voxel coordinate are reduced to a single row,
    using ``'mean'`` when ``average_points`` is true and ``'max'``
    otherwise.

    Note:
        The CPU and GPU implementation get the same output, but have
        numerical difference after summation and division (e.g., 5e-7).

    Args:
        voxel_size (list): list [x, y, z] size of three dimension.
        point_cloud_range (list): The coordinate range of points, [x_min,
            y_min, z_min, x_max, y_max, z_max].
        average_points (bool): whether to use avg pooling to scatter points
            into voxel.
    """

    def __init__(self, voxel_size, point_cloud_range, average_points: bool):
        super().__init__()

        # Stored as given; only ``average_points`` is read by this class.
        self.voxel_size = voxel_size
        self.point_cloud_range = point_cloud_range
        self.average_points = average_points

    def forward_single(self, points, coors):
        """Reduce one set of points into voxels.

        Args:
            points (torch.Tensor): Points to be reduced into voxels.
            coors (torch.Tensor): Corresponding voxel coordinates
                (specifically multi-dim voxel index) of each points.

        Returns:
            voxel_feats (torch.Tensor): Reduced features, input features
                that shares the same voxel coordinates are reduced to one
                row.
            voxel_coors (torch.Tensor): Voxel coordinates.
        """
        if self.average_points:
            reduce_type = 'mean'
        else:
            reduce_type = 'max'
        return dynamic_scatter(points.contiguous(), coors.contiguous(),
                               reduce_type)

    def forward(self, points, coors):
        """Scatter points/features into voxels.

        When ``coors`` has three columns it is handled in a single call.
        Otherwise column 0 is treated as the batch index: each sample is
        scattered separately and the index is prepended again to the
        resulting voxel coordinates.

        Args:
            points (torch.Tensor): Points to be reduced into voxels.
            coors (torch.Tensor): Corresponding voxel coordinates
                (specifically multi-dim voxel index) of each points.

        Returns:
            voxel_feats (torch.Tensor): Reduced features, input features
                that shares the same voxel coordinates are reduced to one
                row.
            voxel_coors (torch.Tensor): Voxel coordinates.
        """
        if coors.size(-1) == 3:
            return self.forward_single(points, coors)

        # The batch size is read from the last row's batch index.
        batch_size = coors[-1, 0] + 1
        feats_per_sample = []
        coors_per_sample = []
        for sample_idx in range(batch_size):
            inds = torch.where(coors[:, 0] == sample_idx)
            sample_feats, sample_coors = self.forward_single(
                points[inds], coors[inds][:, 1:])
            # Put the batch index back as the leading coordinate column.
            sample_coors = nn.functional.pad(
                sample_coors, (1, 0), mode='constant', value=sample_idx)
            coors_per_sample.append(sample_coors)
            feats_per_sample.append(sample_feats)

        features = torch.cat(feats_per_sample, dim=0)
        feature_coors = torch.cat(coors_per_sample, dim=0)
        return features, feature_coors

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'voxel_size={self.voxel_size!s}'
                f', point_cloud_range={self.point_cloud_range!s}'
                f', average_points={self.average_points!s}'
                ')')
+import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.module import Module +from torch.nn.parameter import Parameter + +from annotator.uniformer.mmcv.cnn import NORM_LAYERS +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output', + 'sync_bn_backward_param', 'sync_bn_backward_data' +]) + + +class SyncBatchNormFunction(Function): + + @staticmethod + def symbolic(g, input, running_mean, running_var, weight, bias, momentum, + eps, group, group_size, stats_mode): + return g.op( + 'mmcv::MMCVSyncBatchNorm', + input, + running_mean, + running_var, + weight, + bias, + momentum_f=momentum, + eps_f=eps, + group_i=group, + group_size_i=group_size, + stats_mode=stats_mode) + + @staticmethod + def forward(self, input, running_mean, running_var, weight, bias, momentum, + eps, group, group_size, stats_mode): + self.momentum = momentum + self.eps = eps + self.group = group + self.group_size = group_size + self.stats_mode = stats_mode + + assert isinstance( + input, (torch.HalfTensor, torch.FloatTensor, + torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \ + f'only support Half or Float Tensor, but {input.type()}' + output = torch.zeros_like(input) + input3d = input.flatten(start_dim=2) + output3d = output.view_as(input3d) + num_channels = input3d.size(1) + + # ensure mean/var/norm/std are initialized as zeros + # ``torch.empty()`` does not guarantee that + mean = torch.zeros( + num_channels, dtype=torch.float, device=input3d.device) + var = torch.zeros( + num_channels, dtype=torch.float, device=input3d.device) + norm = torch.zeros_like( + input3d, dtype=torch.float, device=input3d.device) + std = torch.zeros( + num_channels, dtype=torch.float, device=input3d.device) + + batch_size = input3d.size(0) + if batch_size > 0: + 
ext_module.sync_bn_forward_mean(input3d, mean) + batch_flag = torch.ones([1], device=mean.device, dtype=mean.dtype) + else: + # skip updating mean and leave it as zeros when the input is empty + batch_flag = torch.zeros([1], device=mean.device, dtype=mean.dtype) + + # synchronize mean and the batch flag + vec = torch.cat([mean, batch_flag]) + if self.stats_mode == 'N': + vec *= batch_size + if self.group_size > 1: + dist.all_reduce(vec, group=self.group) + total_batch = vec[-1].detach() + mean = vec[:num_channels] + + if self.stats_mode == 'default': + mean = mean / self.group_size + elif self.stats_mode == 'N': + mean = mean / total_batch.clamp(min=1) + else: + raise NotImplementedError + + # leave var as zeros when the input is empty + if batch_size > 0: + ext_module.sync_bn_forward_var(input3d, mean, var) + + if self.stats_mode == 'N': + var *= batch_size + if self.group_size > 1: + dist.all_reduce(var, group=self.group) + + if self.stats_mode == 'default': + var /= self.group_size + elif self.stats_mode == 'N': + var /= total_batch.clamp(min=1) + else: + raise NotImplementedError + + # if the total batch size over all the ranks is zero, + # we should not update the statistics in the current batch + update_flag = total_batch.clamp(max=1) + momentum = update_flag * self.momentum + ext_module.sync_bn_forward_output( + input3d, + mean, + var, + weight, + bias, + running_mean, + running_var, + norm, + std, + output3d, + eps=self.eps, + momentum=momentum, + group_size=self.group_size) + self.save_for_backward(norm, std, weight) + return output + + @staticmethod + @once_differentiable + def backward(self, grad_output): + norm, std, weight = self.saved_tensors + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(weight) + grad_input = torch.zeros_like(grad_output) + grad_output3d = grad_output.flatten(start_dim=2) + grad_input3d = grad_input.view_as(grad_output3d) + + batch_size = grad_input3d.size(0) + if batch_size > 0: + 
@NORM_LAYERS.register_module(name='MMSyncBN')
class SyncBatchNorm(Module):
    """Synchronized Batch Normalization.

    Args:
        num_features (int): number of features/channels in input tensor
        eps (float, optional): a value added to the denominator for
            numerical stability. Defaults to 1e-5.
        momentum (float, optional): the value used for the running_mean and
            running_var computation. Defaults to 0.1.
        affine (bool, optional): whether to use learnable affine
            parameters. Defaults to True.
        track_running_stats (bool, optional): whether to track the running
            mean and variance during training. When set to False, this
            module does not track such statistics, and initializes
            statistics buffers ``running_mean`` and ``running_var`` as
            ``None``. When these buffers are ``None``, this module always
            uses batch statistics in both training and eval modes.
            Defaults to True.
        group (int, optional): synchronization of stats happen within
            each process group individually. By default it is
            synchronization across the whole world. Defaults to None.
        stats_mode (str, optional): The statistical mode. Available options
            includes ``'default'`` and ``'N'``. Defaults to 'default'.
            When ``stats_mode=='default'``, it computes the overall
            statistics using those from each worker with equal weight,
            i.e., the statistics are synchronized and simply divided by
            ``group``. This mode will produce inaccurate statistics when
            empty tensors occur.
            When ``stats_mode=='N'``, it computes the overall statistics
            using the total number of batches in each worker ignoring the
            number of group, i.e., the statistics are synchronized and then
            divided by the total batch ``N``. This mode is beneficial when
            empty tensors occur during training, as it averages the total
            mean by the real number of batch.
    """

    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.1,
                 affine=True,
                 track_running_stats=True,
                 group=None,
                 stats_mode='default'):
        super(SyncBatchNorm, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats

        # Fall back to the default (world) process group.
        if group is None:
            group = dist.group.WORLD
        self.group = group
        self.group_size = dist.get_world_size(group)

        assert stats_mode in ['default', 'N'], \
            f'"stats_mode" only accepts "default" and "N", got "{stats_mode}"'
        self.stats_mode = stats_mode

        # Affine parameters are registered before the statistics buffers.
        if not self.affine:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            self.weight = Parameter(torch.Tensor(num_features))
            self.bias = Parameter(torch.Tensor(num_features))

        if not self.track_running_stats:
            for buffer_name in ('running_mean', 'running_var',
                                'num_batches_tracked'):
                self.register_buffer(buffer_name, None)
        else:
            self.register_buffer('running_mean', torch.zeros(num_features))
            self.register_buffer('running_var', torch.ones(num_features))
            self.register_buffer('num_batches_tracked',
                                 torch.tensor(0, dtype=torch.long))

        self.reset_parameters()

    def reset_running_stats(self):
        """Reset the running mean/var and the batch counter in place."""
        if not self.track_running_stats:
            return
        self.running_mean.zero_()
        self.running_var.fill_(1)
        self.num_batches_tracked.zero_()

    def reset_parameters(self):
        """Reset the running statistics and re-initialize weight/bias."""
        self.reset_running_stats()
        if not self.affine:
            return
        self.weight.data.uniform_()  # pytorch use ones_()
        self.bias.data.zero_()

    def forward(self, input):
        """Normalize ``input``.

        Batch statistics are computed through ``SyncBatchNormFunction``
        when the module is training or does not track running statistics;
        otherwise ``F.batch_norm`` is applied with the stored running
        statistics.

        Raises:
            ValueError: If ``input`` has fewer than 2 dimensions.
        """
        if input.dim() < 2:
            raise ValueError(
                f'expected at least 2D input, got {input.dim()}D input')

        averaging_factor = 0.0 if self.momentum is None else self.momentum

        counting = self.training and self.track_running_stats
        if counting and self.num_batches_tracked is not None:
            self.num_batches_tracked += 1
            if self.momentum is None:
                # use cumulative moving average
                averaging_factor = 1.0 / float(self.num_batches_tracked)
            else:
                # use exponential moving average
                averaging_factor = self.momentum

        use_batch_stats = self.training or not self.track_running_stats
        if not use_batch_stats:
            return F.batch_norm(input, self.running_mean, self.running_var,
                                self.weight, self.bias, False,
                                averaging_factor, self.eps)

        return SyncBatchNormFunction.apply(
            input, self.running_mean, self.running_var, self.weight,
            self.bias, averaging_factor, self.eps, self.group,
            self.group_size, self.stats_mode)

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'({self.num_features}, '
                f'eps={self.eps}, '
                f'momentum={self.momentum}, '
                f'affine={self.affine}, '
                f'track_running_stats={self.track_running_stats}, '
                f'group_size={self.group_size},'
                f'stats_mode={self.stats_mode})')
class ThreeNN(Function):
    """Find the top-3 nearest neighbors of the target set from the source set.

    Please refer to `Paper of PointNet++ <https://arxiv.org/abs/1706.02413>`_
    for more details.
    """

    @staticmethod
    def forward(ctx, target: torch.Tensor,
                source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            target (Tensor): shape (B, N, 3), points set that needs to
                find the nearest neighbors.
            source (Tensor): shape (B, M, 3), points set that is used
                to find the nearest neighbors of points in target set.

        Returns:
            Tuple[Tensor, Tensor]: ``(dist, idx)``.  ``dist`` has shape
            (B, N, 3) and holds the L2 distance of each point in target
            set to their corresponding nearest neighbors.  ``idx`` is an
            int32 tensor of shape (B, N, 3) filled in by the extension
            kernel alongside the distances; it is marked
            non-differentiable.
        """
        target = target.contiguous()
        source = source.contiguous()

        B, N, _ = target.size()
        m = source.size(1)
        # Allocate the outputs on the same device as the input rather than
        # on whatever CUDA device happens to be current, so multi-GPU
        # callers get buffers next to their data.  dtypes match the
        # original FloatTensor / IntTensor buffers.
        dist2 = target.new_empty((B, N, 3), dtype=torch.float32)
        idx = target.new_empty((B, N, 3), dtype=torch.int32)

        ext_module.three_nn_forward(target, source, dist2, idx, b=B, n=N, m=m)
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)

        # The kernel writes into ``dist2``; the square root is taken here.
        return torch.sqrt(dist2), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        """Return no gradient for either input."""
        return None, None
class TINShiftFunction(Function):
    """Autograd wrapper around the ``tin_shift`` extension kernels."""

    @staticmethod
    def forward(ctx, input, shift):
        """Validate the shapes and run ``tin_shift_forward``.

        Raises:
            ValueError: If ``input.size(2)`` is smaller than, or not a
                multiple of, ``shift.size(1)``.
        """
        channels = input.size(2)
        segments = shift.size(1)
        shapes_ok = channels // segments > 0 and channels % segments == 0
        if not shapes_ok:
            raise ValueError('C should be a multiple of num_segments, '
                             f'but got C={channels} and '
                             f'num_segments={segments}.')

        ctx.save_for_backward(shift)

        # The kernel writes its result into this zero-initialized buffer.
        shifted = torch.zeros_like(input)
        ext_module.tin_shift_forward(input, shift, shifted)
        return shifted

    @staticmethod
    def backward(ctx, grad_output):
        """Run ``tin_shift_backward``; the shift gradient is all zeros."""
        (shift,) = ctx.saved_tensors
        grad_input = grad_output.new_zeros(grad_output.size())
        grad_shift = shift.new_zeros(shift.size())
        ext_module.tin_shift_backward(grad_output, shift, grad_input)
        return grad_input, grad_shift
+ """ + return tin_shift(input, shift) diff --git a/lavis/common/annotator/uniformer/mmcv/ops/upfirdn2d.py b/lavis/common/annotator/uniformer/mmcv/ops/upfirdn2d.py new file mode 100644 index 0000000000000000000000000000000000000000..c8bb2c3c949eed38a6465ed369fa881538dca010 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/ops/upfirdn2d.py @@ -0,0 +1,330 @@ +# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py # noqa:E501 + +# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator +# Augmentation (ADA) +# ======================================================================= + +# 1. Definitions + +# "Licensor" means any person or entity that distributes its Work. + +# "Software" means the original work of authorship made available under +# this License. + +# "Work" means the Software and any additions to or derivative works of +# the Software that are made available under this License. + +# The terms "reproduce," "reproduction," "derivative works," and +# "distribution" have the meaning as provided under U.S. copyright law; +# provided, however, that for the purposes of this License, derivative +# works shall not include works that remain separable from, or merely +# link (or bind by name) to the interfaces of, the Work. + +# Works, including the Software, are "made available" under this License +# by including in or with the Work either (a) a copyright notice +# referencing the applicability of this License to the Work, or (b) a +# copy of this License. + +# 2. License Grants + +# 2.1 Copyright Grant. Subject to the terms and conditions of this +# License, each Licensor grants to you a perpetual, worldwide, +# non-exclusive, royalty-free, copyright license to reproduce, +# prepare derivative works of, publicly display, publicly perform, +# sublicense and distribute its Work and any resulting derivative +# works in any form. + +# 3. 
Limitations + +# 3.1 Redistribution. You may reproduce or distribute the Work only +# if (a) you do so under this License, (b) you include a complete +# copy of this License with your distribution, and (c) you retain +# without modification any copyright, patent, trademark, or +# attribution notices that are present in the Work. + +# 3.2 Derivative Works. You may specify that additional or different +# terms apply to the use, reproduction, and distribution of your +# derivative works of the Work ("Your Terms") only if (a) Your Terms +# provide that the use limitation in Section 3.3 applies to your +# derivative works, and (b) you identify the specific derivative +# works that are subject to Your Terms. Notwithstanding Your Terms, +# this License (including the redistribution requirements in Section +# 3.1) will continue to apply to the Work itself. + +# 3.3 Use Limitation. The Work and any derivative works thereof only +# may be used or intended for use non-commercially. Notwithstanding +# the foregoing, NVIDIA and its affiliates may use the Work and any +# derivative works commercially. As used herein, "non-commercially" +# means for research or evaluation purposes only. + +# 3.4 Patent Claims. If you bring or threaten to bring a patent claim +# against any Licensor (including any claim, cross-claim or +# counterclaim in a lawsuit) to enforce any patents that you allege +# are infringed by any Work, then your rights under this License from +# such Licensor (including the grant in Section 2.1) will terminate +# immediately. + +# 3.5 Trademarks. This License does not grant any rights to use any +# Licensor’s or its affiliates’ names, logos, or trademarks, except +# as necessary to reproduce the notices described in this License. + +# 3.6 Termination. If you violate any term of this License, then your +# rights under this License (including the grant in Section 2.1) will +# terminate immediately. + +# 4. Disclaimer of Warranty. 
+ +# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +# THIS LICENSE. + +# 5. Limitation of Liability. + +# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGES. + +# ======================================================================= + +import torch +from torch.autograd import Function +from torch.nn import functional as F + +from annotator.uniformer.mmcv.utils import to_2tuple +from ..utils import ext_loader + +upfirdn2d_ext = ext_loader.load_ext('_ext', ['upfirdn2d']) + + +class UpFirDn2dBackward(Function): + + @staticmethod + def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, + in_size, out_size): + + up_x, up_y = up + down_x, down_y = down + g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad + + grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) + + grad_input = upfirdn2d_ext.upfirdn2d( + grad_output, + grad_kernel, + up_x=down_x, + up_y=down_y, + down_x=up_x, + down_y=up_y, + pad_x0=g_pad_x0, + pad_x1=g_pad_x1, + pad_y0=g_pad_y0, + pad_y1=g_pad_y1) + grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], + in_size[3]) + + ctx.save_for_backward(kernel) + + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + ctx.up_x = up_x + ctx.up_y = up_y + 
class UpFirDn2d(Function):
    """Autograd function running the ``upfirdn2d`` extension kernel.

    ``forward`` records on ``ctx`` everything ``UpFirDn2dBackward`` needs:
    the kernel and its flipped copy, the up/down factors, the forward
    padding, the gradient padding and the input/output sizes.
    """

    @staticmethod
    def forward(ctx, input, kernel, up, down, pad):
        up_x, up_y = up
        down_x, down_y = down
        pad_x0, pad_x1, pad_y0, pad_y1 = pad

        kernel_h, kernel_w = kernel.shape
        _, channel, in_h, in_w = input.shape
        ctx.in_size = input.shape

        # One single-channel, channels-last image per (batch, channel) pair.
        planes = input.reshape(-1, in_h, in_w, 1)

        # The flipped kernel is what the backward pass filters with.
        flipped_kernel = torch.flip(kernel, [0, 1])
        ctx.save_for_backward(kernel, flipped_kernel)

        out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
        out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
        ctx.out_size = (out_h, out_w)

        ctx.up = (up_x, up_y)
        ctx.down = (down_x, down_y)
        ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1)

        # Padding handed to the gradient computation in ``backward``.
        ctx.g_pad = (
            kernel_w - pad_x0 - 1,
            in_w * up_x - out_w * down_x + pad_x0 - up_x + 1,
            kernel_h - pad_y0 - 1,
            in_h * up_y - out_h * down_y + pad_y0 - up_y + 1,
        )

        result = upfirdn2d_ext.upfirdn2d(
            planes,
            kernel,
            up_x=up_x,
            up_y=up_y,
            down_x=down_x,
            down_y=down_y,
            pad_x0=pad_x0,
            pad_x1=pad_x1,
            pad_y0=pad_y0,
            pad_y1=pad_y1)
        return result.view(-1, channel, out_h, out_w)

    @staticmethod
    def backward(ctx, grad_output):
        kernel, grad_kernel = ctx.saved_tensors

        grad_input = UpFirDn2dBackward.apply(
            grad_output,
            kernel,
            grad_kernel,
            ctx.up,
            ctx.down,
            ctx.pad,
            ctx.g_pad,
            ctx.in_size,
            ctx.out_size,
        )

        # Only ``input`` receives a gradient.
        return grad_input, None, None, None, None
def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1,
                     pad_y0, pad_y1):
    """Pure-PyTorch upsample / FIR filter / downsample.

    Each (batch, channel) plane of ``input`` is processed independently:
    zeros are inserted between samples (``up_x``/``up_y``), the plane is
    padded (positive pads) or cropped (negative pads), filtered with
    ``kernel`` and finally subsampled (``down_x``/``down_y``).

    Args:
        input (Tensor): Tensor with shape of (n, c, h, w).
        kernel (Tensor): 2D filter kernel of shape (kernel_h, kernel_w).
        up_x, up_y (int): Upsampling factors.
        down_x, down_y (int): Downsampling factors.
        pad_x0, pad_x1, pad_y0, pad_y1 (int): Padding before/after each
            axis; negative values crop.

    Returns:
        Tensor: Result reshaped to (-1, c, out_h, out_w).
    """
    _, channel, in_h, in_w = input.shape
    kernel_h, kernel_w = kernel.shape
    minor = 1

    # Channels-last layout with one plane per (batch, channel) pair.
    planes = input.reshape(-1, in_h, in_w, minor)

    # Upsample by appending zeros in singleton axes placed after H and W.
    upsampled = planes.view(-1, in_h, 1, in_w, 1, minor)
    upsampled = F.pad(upsampled, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
    upsampled = upsampled.view(-1, in_h * up_y, in_w * up_x, minor)

    # Positive padding goes through F.pad, negative padding is a crop.
    padded = F.pad(upsampled, [
        0, 0,
        max(pad_x0, 0), max(pad_x1, 0),
        max(pad_y0, 0), max(pad_y1, 0),
    ])
    row_end = padded.shape[1] - max(-pad_y1, 0)
    col_end = padded.shape[2] - max(-pad_x1, 0)
    padded = padded[:, max(-pad_y0, 0):row_end, max(-pad_x0, 0):col_end, :]

    padded_h = in_h * up_y + pad_y0 + pad_y1
    padded_w = in_w * up_x + pad_x0 + pad_x1
    padded = padded.permute(0, 3, 1, 2)
    padded = padded.reshape([-1, 1, padded_h, padded_w])

    # conv2d is a cross-correlation, so the kernel is flipped first.
    flipped = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
    filtered = F.conv2d(padded, flipped)
    filtered = filtered.reshape(
        -1,
        minor,
        padded_h - kernel_h + 1,
        padded_w - kernel_w + 1,
    )
    filtered = filtered.permute(0, 2, 3, 1)
    filtered = filtered[:, ::down_y, ::down_x, :]

    out_h = (padded_h - kernel_h) // down_y + 1
    out_w = (padded_w - kernel_w) // down_x + 1

    return filtered.view(-1, channel, out_h, out_w)
b/lavis/common/annotator/uniformer/mmcv/ops/voxelize.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward']) + + +class _Voxelization(Function): + + @staticmethod + def forward(ctx, + points, + voxel_size, + coors_range, + max_points=35, + max_voxels=20000): + """Convert kitti points(N, >=3) to voxels. + + Args: + points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points + and points[:, 3:] contain other information like reflectivity. + voxel_size (tuple or float): The size of voxel with the shape of + [3]. + coors_range (tuple or float): The coordinate range of voxel with + the shape of [6]. + max_points (int, optional): maximum points contained in a voxel. if + max_points=-1, it means using dynamic_voxelize. Default: 35. + max_voxels (int, optional): maximum voxels this function create. + for second, 20000 is a good choice. Users should shuffle points + before call this function because max_voxels may drop points. + Default: 20000. + + Returns: + voxels_out (torch.Tensor): Output voxels with the shape of [M, + max_points, ndim]. Only contain points and returned when + max_points != -1. + coors_out (torch.Tensor): Output coordinates with the shape of + [M, 3]. + num_points_per_voxel_out (torch.Tensor): Num points per voxel with + the shape of [M]. Only returned when max_points != -1. 
class Voxelization(nn.Module):
    """Convert kitti points(N, >=3) to voxels.

    Please refer to `PVCNN <https://arxiv.org/abs/1907.03739>`_ for more
    details.

    Args:
        voxel_size (tuple or float): The size of voxel with the shape of
            [3].
        point_cloud_range (tuple or float): The coordinate range of voxel
            with the shape of [6].
        max_num_points (int): maximum points contained in a voxel. if
            max_points=-1, it means using dynamic_voxelize.
        max_voxels (int or tuple, optional): maximum voxels this function
            create. for second, 20000 is a good choice. Users should
            shuffle points before call this function because max_voxels
            may drop points. A tuple is stored as given, with entry 0 used
            in training mode and entry 1 otherwise; any other value is
            expanded with ``_pair``. Default: 20000.
    """

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000):
        super().__init__()

        self.voxel_size = voxel_size
        self.point_cloud_range = point_cloud_range
        self.max_num_points = max_num_points
        self.max_voxels = (
            max_voxels if isinstance(max_voxels, tuple) else _pair(max_voxels))

        # Number of voxels along each axis: the extent of the range
        # divided by the voxel size, rounded to the nearest integer.
        range_t = torch.tensor(point_cloud_range, dtype=torch.float32)
        size_t = torch.tensor(voxel_size, dtype=torch.float32)
        extent = range_t[3:] - range_t[:3]
        grid_size = torch.round(extent / size_t).long()
        self.grid_size = grid_size

        # the origin shape is as [x-len, y-len, z-len]
        # [w, h, d] -> [d, h, w]
        planar = list(grid_size[:2])
        self.pcd_shape = [1, *planar[::-1]]

    def forward(self, input):
        """Voxelize ``input`` using the limit for the current mode."""
        limit_index = 0 if self.training else 1
        max_voxels = self.max_voxels[limit_index]

        return voxelization(input, self.voxel_size, self.point_cloud_range,
                            self.max_num_points, max_voxels)

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'voxel_size={self.voxel_size!s}'
                f', point_cloud_range={self.point_cloud_range!s}'
                f', max_num_points={self.max_num_points!s}'
                f', max_voxels={self.max_voxels!s}'
                ')')
def _chunk_size(num_items, num_devices):
    """Return how many consecutive list items are assigned to each device.

    This is a ceil division, so item ``i`` belongs to device
    ``i // _chunk_size(...)``. ``scatter`` and ``synchronize_stream`` must
    agree on this mapping.
    """
    return (num_items - 1) // num_devices + 1


def scatter(input, devices, streams=None):
    """Scatters tensor across multiple GPUs.

    Args:
        input (list | torch.Tensor): A tensor, or an arbitrarily nested list
            of tensors.
        devices (list[int]): Target device ids. ``[-1]`` means CPU.
        streams (list, optional): One copy stream (or None) per device.

    Returns:
        list | torch.Tensor: Same nesting as ``input``. Tensors are copied to
        their device, or unsqueezed on dim 0 in the CPU case.

    Raises:
        TypeError: If ``input`` contains something other than lists/tensors.
    """
    if streams is None:
        streams = [None] * len(devices)

    if isinstance(input, list):
        chunk_size = _chunk_size(len(input), len(devices))
        return [
            scatter(item, [devices[i // chunk_size]],
                    [streams[i // chunk_size]])
            for i, item in enumerate(input)
        ]
    elif isinstance(input, torch.Tensor):
        output = input.contiguous()
        # TODO: copy to a pinned buffer first (if copying from CPU)
        stream = streams[0] if output.numel() > 0 else None
        if devices != [-1]:
            with torch.cuda.device(devices[0]), torch.cuda.stream(stream):
                output = output.cuda(devices[0], non_blocking=True)
        else:
            # unsqueeze the first dimension thus the tensor's shape is the
            # same as those scattered with GPU.
            output = output.unsqueeze(0)
        return output
    else:
        raise TypeError(f'Unknown type {type(input)}.')


def synchronize_stream(output, devices, streams):
    """Make the current stream of each device wait for its copy stream.

    The item-to-device mapping mirrors :func:`scatter` (ceil chunking).
    Previously a floor division was used here, which skipped trailing items
    (or every item when there were fewer items than devices) and paired
    others with the wrong device whenever the list length was not a multiple
    of the device count.

    Args:
        output (list | torch.Tensor): Result of :func:`scatter`.
        devices (list[int]): Device ids passed to :func:`scatter`.
        streams (list): Copy streams passed to :func:`scatter`.

    Raises:
        TypeError: If ``output`` contains something other than lists/tensors.
    """
    if isinstance(output, list):
        chunk_size = _chunk_size(len(output), len(devices))
        for i, item in enumerate(output):
            synchronize_stream(item, [devices[i // chunk_size]],
                               [streams[i // chunk_size]])
    elif isinstance(output, torch.Tensor):
        # Empty tensors were scattered without a stream, nothing to wait on.
        if output.numel() != 0:
            with torch.cuda.device(devices[0]):
                main_stream = torch.cuda.current_stream()
                main_stream.wait_stream(streams[0])
                output.record_stream(main_stream)
    else:
        raise TypeError(f'Unknown type {type(output)}.')


def get_input_device(input):
    """Return the CUDA device index of the first CUDA tensor, else ``-1``.

    Args:
        input (list | torch.Tensor): A tensor or nested list of tensors.

    Raises:
        TypeError: If ``input`` contains something other than lists/tensors.
    """
    if isinstance(input, list):
        for item in input:
            input_device = get_input_device(item)
            if input_device != -1:
                return input_device
        return -1
    elif isinstance(input, torch.Tensor):
        return input.get_device() if input.is_cuda else -1
    else:
        raise TypeError(f'Unknown type {type(input)}.')


class Scatter:
    """Namespace for the DataContainer-aware scatter entry point."""

    @staticmethod
    def forward(target_gpus, input):
        """Scatter ``input`` to ``target_gpus`` and return a tuple of results.

        Args:
            target_gpus (list[int]): Target device ids, ``[-1]`` for CPU.
            input (list | torch.Tensor): Data to scatter.

        Returns:
            tuple: ``tuple()`` of whatever :func:`scatter` returned.
        """
        input_device = get_input_device(input)
        streams = None
        if input_device == -1 and target_gpus != [-1]:
            # Perform CPU to GPU copies in a background stream
            # NOTE(review): `_get_stream` is a private torch helper whose
            # accepted argument type may differ between torch releases —
            # confirm against the pinned torch version.
            streams = [_get_stream(device) for device in target_gpus]

        outputs = scatter(input, target_gpus, streams)
        # Synchronize with the copy stream
        if streams is not None:
            synchronize_stream(outputs, target_gpus, streams)

        return tuple(outputs)
def _pad_and_stack(samples):
    """Pad the last ``pad_dims`` dims of each sample to the group maximum and
    stack the group with ``default_collate``.

    ``samples`` is a non-empty list of :class:`DataContainer` holding
    tensors; the first sample supplies ``pad_dims`` and the reference shape.
    """
    ref = samples[0]
    pad_dims = ref.pad_dims
    ndim = ref.dim()
    assert ndim > pad_dims
    # max_shape[k] is the largest size seen for dimension -(k + 1).
    max_shape = [ref.size(-dim) for dim in range(1, pad_dims + 1)]
    for sample in samples:
        # Leading (non-padded) dimensions must already agree.
        for dim in range(0, ndim - pad_dims):
            assert ref.size(dim) == sample.size(dim)
        for dim in range(1, pad_dims + 1):
            max_shape[dim - 1] = max(max_shape[dim - 1], sample.size(-dim))
    padded_samples = []
    for sample in samples:
        # F.pad takes (left, right) pairs starting from the last dim; only
        # the "right" slots are filled.
        pad = [0] * (pad_dims * 2)
        for dim in range(1, pad_dims + 1):
            pad[2 * dim - 1] = max_shape[dim - 1] - sample.size(-dim)
        padded_samples.append(
            F.pad(sample.data, pad, value=sample.padding_value))
    return default_collate(padded_samples)


def collate(batch, samples_per_gpu=1):
    """Puts each data field into a tensor/DataContainer with outer dimension
    batch size.

    Extend default_collate to add support for
    :type:`~mmcv.parallel.DataContainer`. There are 3 cases.

    1. cpu_only = True, e.g., meta data
    2. cpu_only = False, stack = True, e.g., images tensors
    3. cpu_only = False, stack = False, e.g., gt bboxes

    Args:
        batch (Sequence): Samples to collate.
        samples_per_gpu (int): Size of the groups DataContainer samples are
            split into. Defaults to 1.

    Returns:
        A ``DataContainer``, list, dict or whatever ``default_collate``
        returns, depending on the type of ``batch[0]``.

    Raises:
        TypeError: If ``batch`` is not a ``Sequence``.
    """

    if not isinstance(batch, Sequence):
        # Not every unsupported object has a ``dtype``; fall back to its type
        # so the intended TypeError is raised instead of AttributeError.
        raise TypeError(
            f"{getattr(batch, 'dtype', type(batch))} is not supported.")

    first = batch[0]
    if isinstance(first, DataContainer):
        stacked = []
        if first.cpu_only:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
            return DataContainer(
                stacked, first.stack, first.padding_value, cpu_only=True)
        elif first.stack:
            for i in range(0, len(batch), samples_per_gpu):
                assert isinstance(batch[i].data, torch.Tensor)
                group = batch[i:i + samples_per_gpu]
                if batch[i].pad_dims is not None:
                    stacked.append(_pad_and_stack(group))
                else:
                    stacked.append(
                        default_collate([sample.data for sample in group]))
        else:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
        return DataContainer(stacked, first.stack, first.padding_value)
    elif isinstance(first, str):
        # A str is a Sequence of str, so the generic Sequence branch below
        # would recurse forever; treat strings as leaves.
        return default_collate(batch)
    elif isinstance(first, Sequence):
        transposed = zip(*batch)
        return [collate(samples, samples_per_gpu) for samples in transposed]
    elif isinstance(first, Mapping):
        return {
            key: collate([d[key] for d in batch], samples_per_gpu)
            for key in first
        }
    else:
        return default_collate(batch)


def assert_tensor_type(func):
    """Decorator for methods that are only valid when ``self.data`` is a
    ``torch.Tensor``; raises ``AttributeError`` otherwise."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not isinstance(args[0].data, torch.Tensor):
            raise AttributeError(
                f'{args[0].__class__.__name__} has no attribute '
                f'{func.__name__} for type {args[0].datatype}')
        return func(*args, **kwargs)

    return wrapper


class DataContainer:
    """A container for any type of objects.

    Typically tensors will be stacked in the collate function and sliced along
    some dimension in the scatter function. This behavior has some limitations.
    1. All tensors have to be the same size.
    2. Types are limited (numpy array or Tensor).

    We design `DataContainer` and `MMDataParallel` to overcome these
    limitations. The behavior can be either of the following.

    - copy to GPU, pad all tensors to the same size and stack them
    - copy to GPU without stacking
    - leave the objects as is and pass it to the model
    - pad_dims specifies the number of last few dimensions to do padding

    Args:
        data: The wrapped object.
        stack (bool): Whether collate should stack the data. Defaults to
            False.
        padding_value (int): Fill value used when padding. Defaults to 0.
        cpu_only (bool): Whether the data is passed through without being
            scattered. Defaults to False.
        pad_dims (int | None): Number of trailing dimensions to pad; one of
            None, 1, 2, 3. Defaults to 2.
    """

    def __init__(self,
                 data,
                 stack=False,
                 padding_value=0,
                 cpu_only=False,
                 pad_dims=2):
        self._data = data
        self._cpu_only = cpu_only
        self._stack = stack
        self._padding_value = padding_value
        assert pad_dims in [None, 1, 2, 3]
        self._pad_dims = pad_dims

    def __repr__(self):
        return f'{self.__class__.__name__}({repr(self.data)})'

    def __len__(self):
        return len(self._data)

    @property
    def data(self):
        return self._data

    @property
    def datatype(self):
        """Tensor type string for tensors, the Python type otherwise."""
        if isinstance(self.data, torch.Tensor):
            return self.data.type()
        else:
            return type(self.data)

    @property
    def cpu_only(self):
        return self._cpu_only

    @property
    def stack(self):
        return self._stack

    @property
    def padding_value(self):
        return self._padding_value

    @property
    def pad_dims(self):
        return self._pad_dims

    @assert_tensor_type
    def size(self, *args, **kwargs):
        return self.data.size(*args, **kwargs)

    @assert_tensor_type
    def dim(self):
        return self.data.dim()


class MMDataParallel(DataParallel):
    """The DataParallel module that supports DataContainer.

    MMDataParallel has two main differences with PyTorch DataParallel:

    - It supports a custom type :class:`DataContainer` which allows more
      flexible control of input data during both GPU and CPU inference.
    - It implements two more APIs ``train_step()`` and ``val_step()``.

    Args:
        module (:class:`nn.Module`): Module to be encapsulated.
        device_ids (list[int]): Device IDS of modules to be scattered to.
            Defaults to None when GPU is not available.
        output_device (str | int): Device ID for output. Defaults to None.
        dim (int): Dimension used to scatter the data. Defaults to 0.
    """

    def __init__(self, *args, dim=0, **kwargs):
        super().__init__(*args, dim=dim, **kwargs)
        # Set explicitly so `scatter` can always read it.
        self.dim = dim

    def forward(self, *inputs, **kwargs):
        """Override the original forward function.

        The main difference lies in the CPU inference where the data in
        :class:`DataContainers` will still be gathered.
        """
        if not self.device_ids:
            # We add the following line thus the module could gather and
            # convert data containers as those in GPU inference
            inputs, kwargs = self.scatter(inputs, kwargs, [-1])
            return self.module(*inputs[0], **kwargs[0])
        else:
            return super().forward(*inputs, **kwargs)

    def scatter(self, inputs, kwargs, device_ids):
        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)

    def _call_module_step(self, step_name, inputs, kwargs):
        """Scatter the arguments and call ``self.module.<step_name>``.

        Shared implementation of :meth:`train_step` and :meth:`val_step`.

        Raises:
            AssertionError: If more than one device id is configured.
            RuntimeError: If a parameter or buffer of the module is not on
                ``self.src_device_obj``.
        """
        if not self.device_ids:
            # We add the following line thus the module could gather and
            # convert data containers as those in GPU inference
            inputs, kwargs = self.scatter(inputs, kwargs, [-1])
            return getattr(self.module, step_name)(*inputs[0], **kwargs[0])

        assert len(self.device_ids) == 1, \
            ('MMDataParallel only supports single GPU training, if you need to'
             ' train with multiple GPUs, please use MMDistributedDataParallel'
             ' instead.')

        for t in chain(self.module.parameters(), self.module.buffers()):
            if t.device != self.src_device_obj:
                raise RuntimeError(
                    'module must have its parameters and buffers '
                    f'on device {self.src_device_obj} (device_ids[0]) but '
                    f'found one of them on device: {t.device}')

        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
        return getattr(self.module, step_name)(*inputs[0], **kwargs[0])

    def train_step(self, *inputs, **kwargs):
        """Run ``self.module.train_step`` on the scattered inputs."""
        return self._call_module_step('train_step', inputs, kwargs)

    def val_step(self, *inputs, **kwargs):
        """Run ``self.module.val_step`` on the scattered inputs."""
        return self._call_module_step('val_step', inputs, kwargs)


class MMDistributedDataParallel(DistributedDataParallel):
    """The DDP module that supports DataContainer.

    MMDDP has two main differences with PyTorch DDP:

    - It supports a custom type :class:`DataContainer` which allows more
      flexible control of input data.
    - It implements two APIs ``train_step()`` and ``val_step()``.
    """

    def to_kwargs(self, inputs, kwargs, device_id):
        # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8
        # to move all tensors to device_id
        return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim)

    def scatter(self, inputs, kwargs, device_ids):
        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)

    def _call_module_step(self, step_name, inputs, kwargs):
        """Shared body of :meth:`train_step` and :meth:`val_step`.

        The statement order below is kept exactly as in the two original
        methods.
        """
        # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the
        # end of backward to the beginning of forward.
        if ('parrots' not in TORCH_VERSION
                and digit_version(TORCH_VERSION) >= digit_version('1.7')
                and self.reducer._rebuild_buckets()):
            print_log(
                'Reducer buckets have been rebuilt in this iteration.',
                logger='mmcv')

        if getattr(self, 'require_forward_param_sync', True):
            self._sync_params()
        if self.device_ids:
            inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
            if len(self.device_ids) == 1:
                output = getattr(self.module, step_name)(*inputs[0],
                                                         **kwargs[0])
            else:
                # NOTE(review): this branch goes through `parallel_apply` on
                # the module copies rather than calling `<step_name>`
                # directly — kept as in the original.
                outputs = self.parallel_apply(
                    self._module_copies[:len(inputs)], inputs, kwargs)
                output = self.gather(outputs, self.output_device)
        else:
            output = getattr(self.module, step_name)(*inputs, **kwargs)

        if torch.is_grad_enabled() and getattr(
                self, 'require_backward_grad_sync', True):
            if self.find_unused_parameters:
                self.reducer.prepare_for_backward(list(_find_tensors(output)))
            else:
                self.reducer.prepare_for_backward([])
        else:
            if ('parrots' not in TORCH_VERSION
                    and digit_version(TORCH_VERSION) > digit_version('1.2')):
                self.require_forward_param_sync = False
        return output

    def train_step(self, *inputs, **kwargs):
        """train_step() API for module wrapped by DistributedDataParallel.

        This method is basically the same as
        ``DistributedDataParallel.forward()``, while replacing
        ``self.module.forward()`` with ``self.module.train_step()``.
        It is compatible with PyTorch 1.1 - 1.5.
        """
        return self._call_module_step('train_step', inputs, kwargs)

    def val_step(self, *inputs, **kwargs):
        """val_step() API for module wrapped by DistributedDataParallel.

        This method is basically the same as
        ``DistributedDataParallel.forward()``, while replacing
        ``self.module.forward()`` with ``self.module.val_step()``.
        It is compatible with PyTorch 1.1 - 1.5.
        """
        return self._call_module_step('val_step', inputs, kwargs)
@MODULE_WRAPPERS.register_module()
class MMDistributedDataParallel(nn.Module):
    """Deprecated distributed wrapper that broadcasts state from rank 0.

    On construction the wrapped module's state dict (and, optionally, its
    buffers) is broadcast with ``torch.distributed.broadcast`` from source
    rank 0.

    Args:
        module (nn.Module): Module to wrap.
        dim (int): Dimension used when scattering inputs. Defaults to 0.
        broadcast_buffers (bool): Whether buffers are broadcast as well.
            Defaults to True.
        bucket_cap_mb (int): Broadcast bucket size in MiB. Defaults to 25.
    """

    def __init__(self,
                 module,
                 dim=0,
                 broadcast_buffers=True,
                 bucket_cap_mb=25):
        super().__init__()
        self.module = module
        self.dim = dim
        self.broadcast_buffers = broadcast_buffers

        mib = 1024 * 1024
        self.broadcast_bucket_size = bucket_cap_mb * mib
        self._sync_params()

    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """Broadcast ``tensors`` from rank 0 in flattened buckets."""
        for bucket in _take_tensors(tensors, buffer_size):
            flat = _flatten_dense_tensors(bucket)
            dist.broadcast(flat, 0)
            received = _unflatten_dense_tensors(flat, bucket)
            for target, synced in zip(bucket, received):
                target.copy_(synced)

    def _sync_params(self):
        """Broadcast the module state dict and, if enabled, its buffers."""
        states = list(self.module.state_dict().values())
        if states:
            self._dist_broadcast_coalesced(states, self.broadcast_bucket_size)
        if not self.broadcast_buffers:
            return

        legacy_torch = (
            TORCH_VERSION != 'parrots'
            and digit_version(TORCH_VERSION) < digit_version('1.0'))
        if legacy_torch:
            buffer_iter = self.module._all_buffers()
        else:
            buffer_iter = self.module.buffers()
        buffers = [b.data for b in buffer_iter]
        if buffers:
            self._dist_broadcast_coalesced(buffers,
                                           self.broadcast_bucket_size)

    def scatter(self, inputs, kwargs, device_ids):
        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)

    def forward(self, *inputs, **kwargs):
        """Scatter to the current CUDA device and call the module."""
        device = torch.cuda.current_device()
        inputs, kwargs = self.scatter(inputs, kwargs, [device])
        return self.module(*inputs[0], **kwargs[0])

    def train_step(self, *inputs, **kwargs):
        """Scatter to the current CUDA device and call ``train_step``."""
        device = torch.cuda.current_device()
        inputs, kwargs = self.scatter(inputs, kwargs, [device])
        return self.module.train_step(*inputs[0], **kwargs[0])

    def val_step(self, *inputs, **kwargs):
        """Scatter to the current CUDA device and call ``val_step``."""
        device = torch.cuda.current_device()
        inputs, kwargs = self.scatter(inputs, kwargs, [device])
        return self.module.val_step(*inputs[0], **kwargs[0])


# Registry of module-wrapper classes, pre-populated with the two PyTorch
# wrappers.
MODULE_WRAPPERS = Registry('module wrapper')
MODULE_WRAPPERS.register_module(module=DataParallel)
MODULE_WRAPPERS.register_module(module=DistributedDataParallel)


def scatter(inputs, target_gpus, dim=0):
    """Scatter inputs to target gpus.

    The only difference from original :func:`scatter` is to add support for
    :type:`~mmcv.parallel.DataContainer`.
    """

    def scatter_map(obj):
        if isinstance(obj, torch.Tensor):
            if target_gpus == [-1]:
                # for CPU inference we use self-implemented scatter
                return Scatter.forward(target_gpus, obj)
            return OrigScatter.apply(target_gpus, None, dim, obj)
        if isinstance(obj, DataContainer):
            # cpu_only containers are handed over untouched
            return (obj.data if obj.cpu_only else Scatter.forward(
                target_gpus, obj.data))
        if isinstance(obj, tuple) and len(obj) > 0:
            return list(zip(*[scatter_map(item) for item in obj]))
        if isinstance(obj, list) and len(obj) > 0:
            per_device = zip(*[scatter_map(item) for item in obj])
            return [list(group) for group in per_device]
        if isinstance(obj, dict) and len(obj) > 0:
            per_device = zip(*[scatter_map(item) for item in obj.items()])
            container = type(obj)
            return [container(group) for group in per_device]
        # Anything else is replicated once per target.
        return [obj for _ in target_gpus]

    # After scatter_map is called, a scatter_map cell will exist. This cell
    # has a reference to the actual function scatter_map, which has references
    # to a closure that has a reference to the scatter_map cell (because the
    # fn is recursive). To avoid this reference cycle, we set the function to
    # None, clearing the cell
    try:
        return scatter_map(inputs)
    finally:
        scatter_map = None


def scatter_kwargs(inputs, kwargs, target_gpus, dim=0):
    """Scatter with support for kwargs dictionary."""
    inputs = scatter(inputs, target_gpus, dim) if inputs else []
    kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
    # Pad the shorter side so every target gets both positional and keyword
    # arguments.
    shortfall = len(kwargs) - len(inputs)
    if shortfall > 0:
        inputs.extend([() for _ in range(shortfall)])
    elif shortfall < 0:
        kwargs.extend([{} for _ in range(-shortfall)])
    return tuple(inputs), tuple(kwargs)
def is_module_wrapper(module):
    """Check if a module is a module wrapper.

    A module counts as a wrapper when it is an instance of any class
    registered in ``MODULE_WRAPPERS`` (subclasses included). In MMCV these
    are DataParallel, DistributedDataParallel and the deprecated
    MMDistributedDataParallel. You may add your own module wrapper by
    registering it to mmcv.parallel.MODULE_WRAPPERS.

    Args:
        module (nn.Module): The module to be checked.

    Returns:
        bool: True if the input module is a module wrapper.
    """
    registered = MODULE_WRAPPERS.module_dict.values()
    return isinstance(module, tuple(registered))
+from .base_module import BaseModule, ModuleList, Sequential +from .base_runner import BaseRunner +from .builder import RUNNERS, build_runner +from .checkpoint import (CheckpointLoader, _load_checkpoint, + _load_checkpoint_with_prefix, load_checkpoint, + load_state_dict, save_checkpoint, weights_to_cpu) +from .default_constructor import DefaultRunnerConstructor +from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info, + init_dist, master_only) +from .epoch_based_runner import EpochBasedRunner, Runner +from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model +from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook, + DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook, + Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, + GradientCumulativeOptimizerHook, Hook, IterTimerHook, + LoggerHook, LrUpdaterHook, MlflowLoggerHook, + NeptuneLoggerHook, OptimizerHook, PaviLoggerHook, + SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook, + WandbLoggerHook) +from .iter_based_runner import IterBasedRunner, IterLoader +from .log_buffer import LogBuffer +from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS, + DefaultOptimizerConstructor, build_optimizer, + build_optimizer_constructor) +from .priority import Priority, get_priority +from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed + +__all__ = [ + 'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer', + 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', + 'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook', + 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', + 'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook', + 'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict', + 'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority', + 'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict', + 'init_dist', 'get_dist_info', 'master_only', 
class BaseModule(nn.Module, metaclass=ABCMeta):
    """Base module for all modules in openmmlab.

    ``BaseModule`` is a wrapper of ``torch.nn.Module`` with additional
    functionality of parameter initialization. Compared with
    ``torch.nn.Module``, ``BaseModule`` mainly adds three attributes.

    - ``init_cfg``: the config to control the initialization.
    - ``init_weights``: The function of parameter
        initialization and recording initialization
        information.
    - ``_params_init_info``: Used to track the parameter
        initialization information. This attribute only
        exists during executing the ``init_weights``.

    Args:
        init_cfg (dict, optional): Initialization config dict.
    """

    def __init__(self, init_cfg=None):
        """Initialize BaseModule, inherited from `torch.nn.Module`"""

        # NOTE init_cfg can be defined in different levels, but init_cfg
        # in low levels has a higher priority.

        super(BaseModule, self).__init__()
        # define default value of init_cfg instead of hard code
        # in init_weights() function
        self._is_init = False

        # Deep-copied so later changes to the caller's config object do not
        # affect this module.
        self.init_cfg = copy.deepcopy(init_cfg)

        # Backward compatibility in derived classes
        # if pretrained is not None:
        #     warnings.warn('DeprecationWarning: pretrained is a deprecated \
        #         key, please consider using init_cfg')
        #     self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)

    @property
    def is_init(self):
        """bool: Whether ``init_weights`` has completed for this module."""
        return self._is_init

    def init_weights(self):
        """Initialize the weights.

        Applies ``init_cfg`` (if any) to this module, then calls
        ``init_weights`` on every child that defines it. The call that finds
        no ``_params_init_info`` on ``self`` is treated as the top-level
        call: it creates the shared record, dumps it at the end and removes
        it from all submodules again.

        A second call on an already initialized module only emits a warning.
        """

        is_top_level_module = False
        # check if it is top-level module
        if not hasattr(self, '_params_init_info'):
            # The `_params_init_info` is used to record the initialization
            # information of the parameters
            # the key should be the obj:`nn.Parameter` of model and the value
            # should be a dict containing
            # - init_info (str): The string that describes the initialization.
            # - tmp_mean_value (FloatTensor): The mean of the parameter,
            #       which indicates whether the parameter has been modified.
            # this attribute would be deleted after all parameters
            # is initialized.
            self._params_init_info = defaultdict(dict)
            is_top_level_module = True

            # Initialize the `_params_init_info`,
            # When detecting the `tmp_mean_value` of
            # the corresponding parameter is changed, update related
            # initialization information
            for name, param in self.named_parameters():
                self._params_init_info[param][
                    'init_info'] = f'The value is the same before and ' \
                                   f'after calling `init_weights` ' \
                                   f'of {self.__class__.__name__} '
                self._params_init_info[param][
                    'tmp_mean_value'] = param.data.mean()

            # pass `params_init_info` to all submodules
            # All submodules share the same `params_init_info`,
            # so it will be updated when parameters are
            # modified at any level of the model.
            for sub_module in self.modules():
                sub_module._params_init_info = self._params_init_info

        # Get the initialized logger, if not exist,
        # create a logger named `mmcv`
        logger_names = list(logger_initialized.keys())
        logger_name = logger_names[0] if logger_names else 'mmcv'

        # Imported inside the function rather than at module level.
        # NOTE(review): presumably to avoid a circular import with the `cnn`
        # package — confirm.
        from ..cnn import initialize
        from ..cnn.utils.weight_init import update_init_info
        module_name = self.__class__.__name__
        if not self._is_init:
            if self.init_cfg:
                print_log(
                    f'initialize {module_name} with init_cfg {self.init_cfg}',
                    logger=logger_name)
                initialize(self, self.init_cfg)
                if isinstance(self.init_cfg, dict):
                    # prevent the parameters of
                    # the pre-trained model
                    # from being overwritten by
                    # the `init_weights`
                    if self.init_cfg['type'] == 'Pretrained':
                        # NOTE(review): this early return skips setting
                        # `_is_init`, the dump and the `_params_init_info`
                        # cleanup below — confirm this is intended.
                        return

            for m in self.children():
                if hasattr(m, 'init_weights'):
                    m.init_weights()
                    # users may overload the `init_weights`
                    update_init_info(
                        m,
                        init_info=f'Initialized by '
                        f'user-defined `init_weights`'
                        f' in {m.__class__.__name__} ')

            self._is_init = True
        else:
            warnings.warn(f'init_weights of {self.__class__.__name__} has '
                          f'been called more than once.')

        if is_top_level_module:
            self._dump_init_info(logger_name)

            # The record only lives for the duration of the top-level call.
            for sub_module in self.modules():
                del sub_module._params_init_info

    @master_only
    def _dump_init_info(self, logger_name):
        """Dump the per-parameter initialization information.

        If the logger has at least one ``FileHandler``, the information is
        written directly to the stream of each such handler; otherwise it is
        emitted through ``print_log``.

        Args:
            logger_name (str): The name of logger.
        """

        logger = get_logger(logger_name)

        with_file_handler = False
        # dump the information to the logger file if there is a `FileHandler`
        for handler in logger.handlers:
            if isinstance(handler, FileHandler):
                handler.stream.write(
                    'Name of parameter - Initialization information\n')
                for name, param in self.named_parameters():
                    handler.stream.write(
                        f'\n{name} - {param.shape}: '
                        f"\n{self._params_init_info[param]['init_info']} \n")
                handler.stream.flush()
                with_file_handler = True
        # Fall back to regular logging when no file handler was found.
        if not with_file_handler:
            for name, param in self.named_parameters():
                print_log(
                    f'\n{name} - {param.shape}: '
                    f"\n{self._params_init_info[param]['init_info']} \n ",
                    logger=logger_name)

    def __repr__(self):
        # Append the init config (when set) to the standard nn.Module repr.
        s = super().__repr__()
        if self.init_cfg:
            s += f'\ninit_cfg={self.init_cfg}'
        return s


class Sequential(BaseModule, nn.Sequential):
    """Sequential module in openmmlab.

    Args:
        init_cfg (dict, optional): Initialization config dict.
    """

    def __init__(self, *args, init_cfg=None):
        # Both bases are initialized explicitly because they take different
        # arguments.
        BaseModule.__init__(self, init_cfg)
        nn.Sequential.__init__(self, *args)


class ModuleList(BaseModule, nn.ModuleList):
    """ModuleList in openmmlab.

    Args:
        modules (iterable, optional): an iterable of modules to add.
        init_cfg (dict, optional): Initialization config dict.
    """

    def __init__(self, modules=None, init_cfg=None):
        # Both bases are initialized explicitly because they take different
        # arguments.
        BaseModule.__init__(self, init_cfg)
        nn.ModuleList.__init__(self, modules)
(The default value is just for backward + compatibility) + meta (dict | None): A dict records some import information such as + environment info and seed, which will be logged in logger hook. + Defaults to None. + max_epochs (int, optional): Total training epochs. + max_iters (int, optional): Total training iterations. + """ + + def __init__(self, + model, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + max_iters=None, + max_epochs=None): + if batch_processor is not None: + if not callable(batch_processor): + raise TypeError('batch_processor must be callable, ' + f'but got {type(batch_processor)}') + warnings.warn('batch_processor is deprecated, please implement ' + 'train_step() and val_step() in the model instead.') + # raise an error is `batch_processor` is not None and + # `model.train_step()` exists. + if is_module_wrapper(model): + _model = model.module + else: + _model = model + if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'): + raise RuntimeError( + 'batch_processor and model.train_step()/model.val_step() ' + 'cannot be both available.') + else: + assert hasattr(model, 'train_step') + + # check the type of `optimizer` + if isinstance(optimizer, dict): + for name, optim in optimizer.items(): + if not isinstance(optim, Optimizer): + raise TypeError( + f'optimizer must be a dict of torch.optim.Optimizers, ' + f'but optimizer["{name}"] is a {type(optim)}') + elif not isinstance(optimizer, Optimizer) and optimizer is not None: + raise TypeError( + f'optimizer must be a torch.optim.Optimizer object ' + f'or dict or None, but got {type(optimizer)}') + + # check the type of `logger` + if not isinstance(logger, logging.Logger): + raise TypeError(f'logger must be a logging.Logger object, ' + f'but got {type(logger)}') + + # check the type of `meta` + if meta is not None and not isinstance(meta, dict): + raise TypeError( + f'meta must be a dict or None, but got {type(meta)}') + + self.model = model + 
self.batch_processor = batch_processor + self.optimizer = optimizer + self.logger = logger + self.meta = meta + # create work_dir + if mmcv.is_str(work_dir): + self.work_dir = osp.abspath(work_dir) + mmcv.mkdir_or_exist(self.work_dir) + elif work_dir is None: + self.work_dir = None + else: + raise TypeError('"work_dir" must be a str or None') + + # get model name from the model class + if hasattr(self.model, 'module'): + self._model_name = self.model.module.__class__.__name__ + else: + self._model_name = self.model.__class__.__name__ + + self._rank, self._world_size = get_dist_info() + self.timestamp = get_time_str() + self.mode = None + self._hooks = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + + if max_epochs is not None and max_iters is not None: + raise ValueError( + 'Only one of `max_epochs` or `max_iters` can be set.') + + self._max_epochs = max_epochs + self._max_iters = max_iters + # TODO: Redesign LogBuffer, it is not flexible and elegant enough + self.log_buffer = LogBuffer() + + @property + def model_name(self): + """str: Name of the model, usually the module class name.""" + return self._model_name + + @property + def rank(self): + """int: Rank of current process. (distributed training)""" + return self._rank + + @property + def world_size(self): + """int: Number of processes participating in the job. 
+ (distributed training)""" + return self._world_size + + @property + def hooks(self): + """list[:obj:`Hook`]: A list of registered hooks.""" + return self._hooks + + @property + def epoch(self): + """int: Current epoch.""" + return self._epoch + + @property + def iter(self): + """int: Current iteration.""" + return self._iter + + @property + def inner_iter(self): + """int: Iteration in an epoch.""" + return self._inner_iter + + @property + def max_epochs(self): + """int: Maximum training epochs.""" + return self._max_epochs + + @property + def max_iters(self): + """int: Maximum training iterations.""" + return self._max_iters + + @abstractmethod + def train(self): + pass + + @abstractmethod + def val(self): + pass + + @abstractmethod + def run(self, data_loaders, workflow, **kwargs): + pass + + @abstractmethod + def save_checkpoint(self, + out_dir, + filename_tmpl, + save_optimizer=True, + meta=None, + create_symlink=True): + pass + + def current_lr(self): + """Get current learning rates. + + Returns: + list[float] | dict[str, list[float]]: Current learning rates of all + param groups. If the runner has a dict of optimizers, this + method will return a dict. + """ + if isinstance(self.optimizer, torch.optim.Optimizer): + lr = [group['lr'] for group in self.optimizer.param_groups] + elif isinstance(self.optimizer, dict): + lr = dict() + for name, optim in self.optimizer.items(): + lr[name] = [group['lr'] for group in optim.param_groups] + else: + raise RuntimeError( + 'lr is not applicable because optimizer does not exist.') + return lr + + def current_momentum(self): + """Get current momentums. + + Returns: + list[float] | dict[str, list[float]]: Current momentums of all + param groups. If the runner has a dict of optimizers, this + method will return a dict. 
+ """ + + def _get_momentum(optimizer): + momentums = [] + for group in optimizer.param_groups: + if 'momentum' in group.keys(): + momentums.append(group['momentum']) + elif 'betas' in group.keys(): + momentums.append(group['betas'][0]) + else: + momentums.append(0) + return momentums + + if self.optimizer is None: + raise RuntimeError( + 'momentum is not applicable because optimizer does not exist.') + elif isinstance(self.optimizer, torch.optim.Optimizer): + momentums = _get_momentum(self.optimizer) + elif isinstance(self.optimizer, dict): + momentums = dict() + for name, optim in self.optimizer.items(): + momentums[name] = _get_momentum(optim) + return momentums + + def register_hook(self, hook, priority='NORMAL'): + """Register a hook into the hook list. + + The hook will be inserted into a priority queue, with the specified + priority (See :class:`Priority` for details of priorities). + For hooks with the same priority, they will be triggered in the same + order as they are registered. + + Args: + hook (:obj:`Hook`): The hook to be registered. + priority (int or str or :obj:`Priority`): Hook priority. + Lower value means higher priority. + """ + assert isinstance(hook, Hook) + if hasattr(hook, 'priority'): + raise ValueError('"priority" is a reserved attribute for hooks') + priority = get_priority(priority) + hook.priority = priority + # insert the hook to a sorted list + inserted = False + for i in range(len(self._hooks) - 1, -1, -1): + if priority >= self._hooks[i].priority: + self._hooks.insert(i + 1, hook) + inserted = True + break + if not inserted: + self._hooks.insert(0, hook) + + def register_hook_from_cfg(self, hook_cfg): + """Register a hook from its cfg. + + Args: + hook_cfg (dict): Hook config. It should have at least keys 'type' + and 'priority' indicating its type and priority. + + Notes: + The specific hook class to register should not use 'type' and + 'priority' arguments during initialization. 
+ """ + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = mmcv.build_from_cfg(hook_cfg, HOOKS) + self.register_hook(hook, priority=priority) + + def call_hook(self, fn_name): + """Call all hooks. + + Args: + fn_name (str): The function name in each hook to be called, such as + "before_train_epoch". + """ + for hook in self._hooks: + getattr(hook, fn_name)(self) + + def get_hook_info(self): + # Get hooks info in each stage + stage_hook_map = {stage: [] for stage in Hook.stages} + for hook in self.hooks: + try: + priority = Priority(hook.priority).name + except ValueError: + priority = hook.priority + classname = hook.__class__.__name__ + hook_info = f'({priority:<12}) {classname:<35}' + for trigger_stage in hook.get_triggered_stages(): + stage_hook_map[trigger_stage].append(hook_info) + + stage_hook_infos = [] + for stage in Hook.stages: + hook_infos = stage_hook_map[stage] + if len(hook_infos) > 0: + info = f'{stage}:\n' + info += '\n'.join(hook_infos) + info += '\n -------------------- ' + stage_hook_infos.append(info) + return '\n'.join(stage_hook_infos) + + def load_checkpoint(self, + filename, + map_location='cpu', + strict=False, + revise_keys=[(r'^module.', '')]): + return load_checkpoint( + self.model, + filename, + map_location, + strict, + self.logger, + revise_keys=revise_keys) + + def resume(self, + checkpoint, + resume_optimizer=True, + map_location='default'): + if map_location == 'default': + if torch.cuda.is_available(): + device_id = torch.cuda.current_device() + checkpoint = self.load_checkpoint( + checkpoint, + map_location=lambda storage, loc: storage.cuda(device_id)) + else: + checkpoint = self.load_checkpoint(checkpoint) + else: + checkpoint = self.load_checkpoint( + checkpoint, map_location=map_location) + + self._epoch = checkpoint['meta']['epoch'] + self._iter = checkpoint['meta']['iter'] + if self.meta is None: + self.meta = {} + self.meta.setdefault('hook_msgs', {}) + # load `last_ckpt`, `best_score`, 
`best_ckpt`, etc. for hook messages + self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {})) + + # Re-calculate the number of iterations when resuming + # models with different number of GPUs + if 'config' in checkpoint['meta']: + config = mmcv.Config.fromstring( + checkpoint['meta']['config'], file_format='.py') + previous_gpu_ids = config.get('gpu_ids', None) + if previous_gpu_ids and len(previous_gpu_ids) > 0 and len( + previous_gpu_ids) != self.world_size: + self._iter = int(self._iter * len(previous_gpu_ids) / + self.world_size) + self.logger.info('the iteration number is changed due to ' + 'change of GPU number') + + # resume meta information meta + self.meta = checkpoint['meta'] + + if 'optimizer' in checkpoint and resume_optimizer: + if isinstance(self.optimizer, Optimizer): + self.optimizer.load_state_dict(checkpoint['optimizer']) + elif isinstance(self.optimizer, dict): + for k in self.optimizer.keys(): + self.optimizer[k].load_state_dict( + checkpoint['optimizer'][k]) + else: + raise TypeError( + 'Optimizer should be dict or torch.optim.Optimizer ' + f'but got {type(self.optimizer)}') + + self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter) + + def register_lr_hook(self, lr_config): + if lr_config is None: + return + elif isinstance(lr_config, dict): + assert 'policy' in lr_config + policy_type = lr_config.pop('policy') + # If the type of policy is all in lower case, e.g., 'cyclic', + # then its first letter will be capitalized, e.g., to be 'Cyclic'. + # This is for the convenient usage of Lr updater. + # Since this is not applicable for ` + # CosineAnnealingLrUpdater`, + # the string will not be changed if it contains capital letters. 
+ if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'LrUpdaterHook' + lr_config['type'] = hook_type + hook = mmcv.build_from_cfg(lr_config, HOOKS) + else: + hook = lr_config + self.register_hook(hook, priority='VERY_HIGH') + + def register_momentum_hook(self, momentum_config): + if momentum_config is None: + return + if isinstance(momentum_config, dict): + assert 'policy' in momentum_config + policy_type = momentum_config.pop('policy') + # If the type of policy is all in lower case, e.g., 'cyclic', + # then its first letter will be capitalized, e.g., to be 'Cyclic'. + # This is for the convenient usage of momentum updater. + # Since this is not applicable for + # `CosineAnnealingMomentumUpdater`, + # the string will not be changed if it contains capital letters. + if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'MomentumUpdaterHook' + momentum_config['type'] = hook_type + hook = mmcv.build_from_cfg(momentum_config, HOOKS) + else: + hook = momentum_config + self.register_hook(hook, priority='HIGH') + + def register_optimizer_hook(self, optimizer_config): + if optimizer_config is None: + return + if isinstance(optimizer_config, dict): + optimizer_config.setdefault('type', 'OptimizerHook') + hook = mmcv.build_from_cfg(optimizer_config, HOOKS) + else: + hook = optimizer_config + self.register_hook(hook, priority='ABOVE_NORMAL') + + def register_checkpoint_hook(self, checkpoint_config): + if checkpoint_config is None: + return + if isinstance(checkpoint_config, dict): + checkpoint_config.setdefault('type', 'CheckpointHook') + hook = mmcv.build_from_cfg(checkpoint_config, HOOKS) + else: + hook = checkpoint_config + self.register_hook(hook, priority='NORMAL') + + def register_logger_hooks(self, log_config): + if log_config is None: + return + log_interval = log_config['interval'] + for info in log_config['hooks']: + logger_hook = mmcv.build_from_cfg( + info, HOOKS, 
default_args=dict(interval=log_interval)) + self.register_hook(logger_hook, priority='VERY_LOW') + + def register_timer_hook(self, timer_config): + if timer_config is None: + return + if isinstance(timer_config, dict): + timer_config_ = copy.deepcopy(timer_config) + hook = mmcv.build_from_cfg(timer_config_, HOOKS) + else: + hook = timer_config + self.register_hook(hook, priority='LOW') + + def register_custom_hooks(self, custom_config): + if custom_config is None: + return + + if not isinstance(custom_config, list): + custom_config = [custom_config] + + for item in custom_config: + if isinstance(item, dict): + self.register_hook_from_cfg(item) + else: + self.register_hook(item, priority='NORMAL') + + def register_profiler_hook(self, profiler_config): + if profiler_config is None: + return + if isinstance(profiler_config, dict): + profiler_config.setdefault('type', 'ProfilerHook') + hook = mmcv.build_from_cfg(profiler_config, HOOKS) + else: + hook = profiler_config + self.register_hook(hook) + + def register_training_hooks(self, + lr_config, + optimizer_config=None, + checkpoint_config=None, + log_config=None, + momentum_config=None, + timer_config=dict(type='IterTimerHook'), + custom_hooks_config=None): + """Register default and custom hooks for training. 
+ + Default and custom hooks include: + + +----------------------+-------------------------+ + | Hooks | Priority | + +======================+=========================+ + | LrUpdaterHook | VERY_HIGH (10) | + +----------------------+-------------------------+ + | MomentumUpdaterHook | HIGH (30) | + +----------------------+-------------------------+ + | OptimizerStepperHook | ABOVE_NORMAL (40) | + +----------------------+-------------------------+ + | CheckpointSaverHook | NORMAL (50) | + +----------------------+-------------------------+ + | IterTimerHook | LOW (70) | + +----------------------+-------------------------+ + | LoggerHook(s) | VERY_LOW (90) | + +----------------------+-------------------------+ + | CustomHook(s) | defaults to NORMAL (50) | + +----------------------+-------------------------+ + + If custom hooks have same priority with default hooks, custom hooks + will be triggered after default hooks. + """ + self.register_lr_hook(lr_config) + self.register_momentum_hook(momentum_config) + self.register_optimizer_hook(optimizer_config) + self.register_checkpoint_hook(checkpoint_config) + self.register_timer_hook(timer_config) + self.register_logger_hooks(log_config) + self.register_custom_hooks(custom_hooks_config) diff --git a/lavis/common/annotator/uniformer/mmcv/runner/builder.py b/lavis/common/annotator/uniformer/mmcv/runner/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..77c96ba0b2f30ead9da23f293c5dc84dd3e4a74f --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/runner/builder.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy + +from ..utils import Registry + +RUNNERS = Registry('runner') +RUNNER_BUILDERS = Registry('runner builder') + + +def build_runner_constructor(cfg): + return RUNNER_BUILDERS.build(cfg) + + +def build_runner(cfg, default_args=None): + runner_cfg = copy.deepcopy(cfg) + constructor_type = runner_cfg.pop('constructor', + 'DefaultRunnerConstructor') + runner_constructor = build_runner_constructor( + dict( + type=constructor_type, + runner_cfg=runner_cfg, + default_args=default_args)) + runner = runner_constructor() + return runner diff --git a/lavis/common/annotator/uniformer/mmcv/runner/checkpoint.py b/lavis/common/annotator/uniformer/mmcv/runner/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..b29ca320679164432f446adad893e33fb2b4b29e --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/runner/checkpoint.py @@ -0,0 +1,707 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os +import os.path as osp +import pkgutil +import re +import time +import warnings +from collections import OrderedDict +from importlib import import_module +from tempfile import TemporaryDirectory + +import torch +import torchvision +from torch.optim import Optimizer +from torch.utils import model_zoo + +import annotator.uniformer.mmcv as mmcv +from ..fileio import FileClient +from ..fileio import load as load_file +from ..parallel import is_module_wrapper +from ..utils import mkdir_or_exist +from .dist_utils import get_dist_info + +ENV_MMCV_HOME = 'MMCV_HOME' +ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' +DEFAULT_CACHE_DIR = '~/.cache' + + +def _get_mmcv_home(): + mmcv_home = os.path.expanduser( + os.getenv( + ENV_MMCV_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) + + mkdir_or_exist(mmcv_home) + return mmcv_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + + This method is modified from :meth:`torch.nn.Module.load_state_dict`. 
+ Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. + """ + unexpected_keys = [] + all_missing_keys = [] + err_msg = [] + + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, prefix=''): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, True, + all_missing_keys, unexpected_keys, + err_msg) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(module) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [ + key for key in all_missing_keys if 'num_batches_tracked' not in key + ] + + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise 
RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') + model_urls.update(_urls) + return model_urls + + +def get_external_models(): + mmcv_home = _get_mmcv_home() + default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmcv_home, 'open_mmlab.json') + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmcv.__path__[0], + 'model_zoo/deprecated.json') + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + state_dict = checkpoint['state_dict'] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith('backbone.'): + new_state_dict[k[9:]] = v + new_checkpoint = dict(state_dict=new_state_dict) + + return new_checkpoint + + +class CheckpointLoader: + """A general checkpoint loader to manage all schemes.""" + + _schemes = {} + + @classmethod + def _register_scheme(cls, prefixes, loader, force=False): + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if (prefix not in cls._schemes) or force: + cls._schemes[prefix] = loader + 
else: + raise KeyError( + f'{prefix} is already registered as a loader backend, ' + 'add "force=True" if you want to override it') + # sort, longer prefixes take priority + cls._schemes = OrderedDict( + sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True)) + + @classmethod + def register_scheme(cls, prefixes, loader=None, force=False): + """Register a loader to CheckpointLoader. + + This method can be used as a normal class method or a decorator. + + Args: + prefixes (str or list[str] or tuple[str]): + The prefix of the registered loader. + loader (function, optional): The loader function to be registered. + When this method is used as a decorator, loader is None. + Defaults to None. + force (bool, optional): Whether to override the loader + if the prefix has already been registered. Defaults to False. + """ + + if loader is not None: + cls._register_scheme(prefixes, loader, force=force) + return + + def _register(loader_cls): + cls._register_scheme(prefixes, loader_cls, force=force) + return loader_cls + + return _register + + @classmethod + def _get_checkpoint_loader(cls, path): + """Finds a loader that supports the given path. Falls back to the local + loader if no other loader is found. + + Args: + path (str): checkpoint path + + Returns: + loader (function): checkpoint loader + """ + + for p in cls._schemes: + if path.startswith(p): + return cls._schemes[p] + + @classmethod + def load_checkpoint(cls, filename, map_location=None, logger=None): + """load checkpoint through URL scheme path. + + Args: + filename (str): checkpoint file name with given prefix + map_location (str, optional): Same as :func:`torch.load`. + Default: None + logger (:mod:`logging.Logger`, optional): The logger for message. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. 
+ """ + + checkpoint_loader = cls._get_checkpoint_loader(filename) + class_name = checkpoint_loader.__name__ + mmcv.print_log( + f'load checkpoint from {class_name[10:]} path: {filename}', logger) + return checkpoint_loader(filename, map_location) + + +@CheckpointLoader.register_scheme(prefixes='') +def load_from_local(filename, map_location): + """load checkpoint by local file path. + + Args: + filename (str): local checkpoint file path + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes=('http://', 'https://')) +def load_from_http(filename, map_location=None, model_dir=None): + """load checkpoint through HTTP or HTTPS scheme path. In distributed + setting, this function only download checkpoint at local rank 0. + + Args: + filename (str): checkpoint file path with modelzoo or + torchvision prefix + map_location (str, optional): Same as :func:`torch.load`. + model_dir (string, optional): directory in which to save the object, + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + checkpoint = model_zoo.load_url( + filename, model_dir=model_dir, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url( + filename, model_dir=model_dir, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='pavi://') +def load_from_pavi(filename, map_location=None): + """load checkpoint through the file path prefixed with pavi. In distributed + setting, this function download ckpt at all ranks to different temporary + directories. 
+ + Args: + filename (str): checkpoint file path with pavi prefix + map_location (str, optional): Same as :func:`torch.load`. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + assert filename.startswith('pavi://'), \ + f'Expected filename startswith `pavi://`, but get {filename}' + model_path = filename[7:] + + try: + from pavi import modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='s3://') +def load_from_ceph(filename, map_location=None, backend='petrel'): + """load checkpoint through the file path prefixed with s3. In distributed + setting, this function download ckpt at all ranks to different temporary + directories. + + Args: + filename (str): checkpoint file path with s3 prefix + map_location (str, optional): Same as :func:`torch.load`. + backend (str, optional): The storage backend type. Options are 'ceph', + 'petrel'. Default: 'petrel'. + + .. warning:: + :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, + please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + allowed_backends = ['ceph', 'petrel'] + if backend not in allowed_backends: + raise ValueError(f'Load from Backend {backend} is not supported.') + + if backend == 'ceph': + warnings.warn( + 'CephBackend will be deprecated, please use PetrelBackend instead') + + # CephClient and PetrelBackend have the same prefix 's3://' and the latter + # will be chosen as default. If PetrelBackend can not be instantiated + # successfully, the CephClient will be chosen. 
+ try: + file_client = FileClient(backend=backend) + except ImportError: + allowed_backends.remove(backend) + file_client = FileClient(backend=allowed_backends[0]) + + with io.BytesIO(file_client.get(filename)) as buffer: + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://')) +def load_from_torchvision(filename, map_location=None): + """load checkpoint through the file path prefixed with modelzoo or + torchvision. + + Args: + filename (str): checkpoint file path with modelzoo or + torchvision prefix + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + model_urls = get_torchvision_models() + if filename.startswith('modelzoo://'): + warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead') + model_name = filename[11:] + else: + model_name = filename[14:] + return load_from_http(model_urls[model_name], map_location=map_location) + + +@CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://')) +def load_from_openmmlab(filename, map_location=None): + """load checkpoint through the file path prefixed with open-mmlab or + openmmlab. + + Args: + filename (str): checkpoint file path with open-mmlab or + openmmlab prefix + map_location (str, optional): Same as :func:`torch.load`. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. 
@CheckpointLoader.register_scheme(prefixes='mmcls://')
def load_from_mmcls(filename, map_location=None):
    """Load a checkpoint addressed by an ``mmcls://`` URI.

    The model name after the scheme is looked up in the table returned by
    ``get_mmcls_models()``, downloaded with ``load_from_http`` and then passed
    through ``_process_mmcls_checkpoint``.

    Args:
        filename (str): checkpoint file path with mmcls prefix
        map_location (str, optional): Same as :func:`torch.load`.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    url_table = get_mmcls_models()
    model_name = filename[len('mmcls://'):]
    raw_checkpoint = load_from_http(
        url_table[model_name], map_location=map_location)
    return _process_mmcls_checkpoint(raw_checkpoint)
def _load_checkpoint_with_prefix(prefix, filename, map_location=None):
    """Load only the weights of one sub-module from a pretrained checkpoint.

    Args:
        prefix (str): The prefix of sub-module. A trailing ``.`` is appended
            when missing.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str | None): Same as :func:`torch.load`. Default: None.

    Returns:
        dict: The entries whose key starts with ``prefix``, with that prefix
        removed from each key.

    Raises:
        AssertionError: If no key in the checkpoint starts with ``prefix``.
    """
    checkpoint = _load_checkpoint(filename, map_location=map_location)

    # Checkpoints may wrap the weights in a 'state_dict' entry or be the
    # weights mapping itself.
    weights = checkpoint
    if 'state_dict' in checkpoint:
        weights = checkpoint['state_dict']

    dotted_prefix = prefix if prefix.endswith('.') else prefix + '.'
    cut = len(dotted_prefix)

    selected = {}
    for key, value in weights.items():
        if key.startswith(dotted_prefix):
            selected[key[cut:]] = value

    assert selected, f'{dotted_prefix} is not in the pretrained model'
    return selected
def weights_to_cpu(state_dict):
    """Copy a model state_dict to cpu.

    Args:
        state_dict (OrderedDict): Model weights, possibly on GPU.

    Returns:
        OrderedDict: A new mapping with every value moved to CPU via
        ``.cpu()``. The ``_metadata`` attribute of the input is carried over
        (an empty ``OrderedDict`` when the input has none).
    """
    cpu_weights = OrderedDict(
        (name, tensor.cpu()) for name, tensor in state_dict.items())
    # Keep metadata in state_dict
    cpu_weights._metadata = getattr(state_dict, '_metadata', OrderedDict())
    return cpu_weights
+ """ + for name, param in module._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in module._buffers.items(): + # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d + if buf is not None: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def get_state_dict(module, destination=None, prefix='', keep_vars=False): + """Returns a dictionary containing a whole state of the module. + + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. + + This method is modified from :meth:`torch.nn.Module.state_dict` to + recursively check parallel module in case that the model has a complicated + structure, e.g., nn.Module(nn.Module(DDP)). + + Args: + module (nn.Module): The module to generate state_dict. + destination (OrderedDict): Returned dict for the state of the + module. + prefix (str): Prefix of the key. + keep_vars (bool): Whether to keep the variable property of the + parameters. Default: False. + + Returns: + dict: A dictionary containing a whole state of the module. 
def save_checkpoint(model,
                    filename,
                    optimizer=None,
                    meta=None,
                    file_client_args=None):
    """Save checkpoint to file.

    The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
    ``optimizer``. By default ``meta`` will contain version and time info.

    Note that a ``meta`` dict passed by the caller is updated in place
    (``mmcv_version``, ``time`` and, when the model defines them,
    ``CLASSES`` are added to it).

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename. A ``pavi://`` prefix uploads the
            file through ``pavi.modelcloud``; anything else is written with a
            ``FileClient`` inferred from ``file_client_args`` and ``filename``.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. A dict
            of optimizers is stored as a dict of their state dicts.
        meta (dict, optional): Metadata to be saved in checkpoint.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmcv.fileio.FileClient` for details.
            Default: None.
            `New in version 1.3.16.`

    Raises:
        TypeError: If ``meta`` is neither a dict nor None.
        ValueError: If ``file_client_args`` is given together with a
            ``pavi://`` filename.
        ImportError: If ``filename`` starts with ``pavi://`` and pavi is not
            installed.
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    # Save the wrapped module, not the parallel wrapper around it.
    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()

    if filename.startswith('pavi://'):
        if file_client_args is not None:
            # Fixed: the two literals used to join without a space
            # ('...starts with"pavi://"...').
            raise ValueError(
                'file_client_args should be "None" if filename starts with '
                f'"pavi://", but got {file_client_args}')
        try:
            from pavi import modelcloud
            from pavi import exception
        except ImportError:
            # Fixed: this is the save path, the message used to say "load".
            raise ImportError(
                'Please install pavi to save checkpoint to modelcloud.')
        model_path = filename[7:]
        root = modelcloud.Folder()
        model_dir, model_name = osp.split(model_path)
        # Reuse the remote model folder when it exists, otherwise create it.
        try:
            model = modelcloud.get(model_dir)
        except exception.NodeNotFoundError:
            model = root.create_training_model(model_dir)
        with TemporaryDirectory() as tmp_dir:
            checkpoint_file = osp.join(tmp_dir, model_name)
            with open(checkpoint_file, 'wb') as f:
                torch.save(checkpoint, f)
                f.flush()
            model.create_file(checkpoint_file, name=model_name)
    else:
        file_client = FileClient.infer_client(file_client_args, filename)
        # Serialize into memory first so any FileClient backend can store it.
        with io.BytesIO() as f:
            torch.save(checkpoint, f)
            file_client.put(f.getvalue(), filename)
@RUNNER_BUILDERS.register_module()
class DefaultRunnerConstructor:
    """Default constructor for runners.

    Customize an existing `Runner` like `EpochBasedRunner` through a
    `RunnerConstructor`. For example, we can inject some new properties and
    functions for `Runner`.

    Example:
        >>> from annotator.uniformer.mmcv.runner import RUNNER_BUILDERS, build_runner
        >>> # Define a new RunnerReconstructor
        >>> @RUNNER_BUILDERS.register_module()
        >>> class MyRunnerConstructor:
        ...     def __init__(self, runner_cfg, default_args=None):
        ...         if not isinstance(runner_cfg, dict):
        ...             raise TypeError('runner_cfg should be a dict, '
        ...                             f'but got {type(runner_cfg)}')
        ...         self.runner_cfg = runner_cfg
        ...         self.default_args = default_args
        ...
        ...     def __call__(self):
        ...         runner = RUNNERS.build(self.runner_cfg,
        ...                                default_args=self.default_args)
        ...         # Add new properties for existing runner
        ...         runner.my_name = 'my_runner'
        ...         runner.my_function = lambda self: print(self.my_name)
        ...         ...
        >>> # build your runner
        >>> runner_cfg = dict(type='EpochBasedRunner', max_epochs=40,
        ...                   constructor='MyRunnerConstructor')
        >>> runner = build_runner(runner_cfg)
    """

    def __init__(self, runner_cfg, default_args=None):
        """Store the runner config and the default build arguments.

        Args:
            runner_cfg (dict): Config passed to ``RUNNERS.build``.
            default_args (dict, optional): Forwarded to ``RUNNERS.build`` as
                ``default_args``. Defaults to None.

        Raises:
            TypeError: If ``runner_cfg`` is not a dict.
        """
        if not isinstance(runner_cfg, dict):
            # Fixed: the message parts used to be separated by a comma, which
            # made TypeError carry a 2-tuple instead of one readable string.
            raise TypeError('runner_cfg should be a dict, '
                            f'but got {type(runner_cfg)}')
        self.runner_cfg = runner_cfg
        self.default_args = default_args

    def __call__(self):
        """Build and return the runner from the stored config."""
        return RUNNERS.build(self.runner_cfg, default_args=self.default_args)
def get_dist_info():
    """Return ``(rank, world_size)`` of the current process.

    Falls back to ``(0, 1)`` when ``torch.distributed`` is unavailable or no
    process group has been initialized.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return 0, 1
    return dist.get_rank(), dist.get_world_size()
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Gradients are divided by the world size, so after the call every rank
    holds the average. Nothing is done when the world size is 1.

    Args:
        params (list[torch.Parameters]): List of parameters of a model
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    # Only parameters that are trainable and already have a gradient take part.
    grads = []
    for param in params:
        if param.requires_grad and param.grad is not None:
            grads.append(param.grad.data)

    world_size = get_dist_info()[1]
    if world_size == 1:
        return

    if not coalesce:
        for grad in grads:
            dist.all_reduce(grad.div_(world_size))
        return
    _allreduce_coalesced(grads, world_size, bucket_size_mb)
@RUNNERS.register_module()
class EpochBasedRunner(BaseRunner):
    """Epoch-based Runner.

    This runner train models epoch by epoch.
    """

    def run_iter(self, data_batch, train_mode, **kwargs):
        """Run one iteration and store its result in ``self.outputs``.

        ``self.batch_processor`` is used when set; otherwise the model's
        ``train_step`` or ``val_step`` is called depending on ``train_mode``.

        Raises:
            TypeError: If the step does not return a dict.
        """
        if self.batch_processor is not None:
            outputs = self.batch_processor(
                self.model, data_batch, train_mode=train_mode, **kwargs)
        elif train_mode:
            outputs = self.model.train_step(data_batch, self.optimizer,
                                            **kwargs)
        else:
            outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
        if not isinstance(outputs, dict):
            # Fixed: the literals used to join as '"model.train_step()"and'.
            raise TypeError('"batch_processor()" or "model.train_step()" '
                            'and "model.val_step()" must return a dict')
        if 'log_vars' in outputs:
            self.log_buffer.update(outputs['log_vars'], outputs['num_samples'])
        self.outputs = outputs

    def train(self, data_loader, **kwargs):
        """Train for one epoch over ``data_loader``, firing the train hooks.

        Increments ``self._iter`` per batch and ``self._epoch`` at the end.
        """
        self.model.train()
        self.mode = 'train'
        self.data_loader = data_loader
        self._max_iters = self._max_epochs * len(self.data_loader)
        self.call_hook('before_train_epoch')
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for i, data_batch in enumerate(self.data_loader):
            self._inner_iter = i
            self.call_hook('before_train_iter')
            self.run_iter(data_batch, train_mode=True, **kwargs)
            self.call_hook('after_train_iter')
            self._iter += 1

        self.call_hook('after_train_epoch')
        self._epoch += 1

    @torch.no_grad()
    def val(self, data_loader, **kwargs):
        """Run one validation epoch over ``data_loader`` without gradients.

        ``kwargs`` are accepted but not forwarded to ``run_iter``; the epoch
        and iteration counters are left untouched.
        """
        self.model.eval()
        self.mode = 'val'
        self.data_loader = data_loader
        self.call_hook('before_val_epoch')
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for i, data_batch in enumerate(self.data_loader):
            self._inner_iter = i
            self.call_hook('before_val_iter')
            self.run_iter(data_batch, train_mode=False)
            self.call_hook('after_val_iter')

        self.call_hook('after_val_epoch')

    def run(self, data_loaders, workflow, max_epochs=None, **kwargs):
        """Start running.

        Args:
            data_loaders (list[:obj:`DataLoader`]): Dataloaders for training
                and validation.
            workflow (list[tuple]): A list of (phase, epochs) to specify the
                running order and epochs. E.g, [('train', 2), ('val', 1)] means
                running 2 epochs for training and 1 epoch for validation,
                iteratively.
            max_epochs (int, optional): Deprecated; overrides the value set at
                instantiation and emits a ``DeprecationWarning``.
        """
        assert isinstance(data_loaders, list)
        assert mmcv.is_list_of(workflow, tuple)
        assert len(data_loaders) == len(workflow)
        if max_epochs is not None:
            warnings.warn(
                'setting max_epochs in run is deprecated, '
                'please set max_epochs in runner_config', DeprecationWarning)
            self._max_epochs = max_epochs

        assert self._max_epochs is not None, (
            'max_epochs must be specified during instantiation')

        # The first 'train' phase determines the total iteration budget.
        for i, flow in enumerate(workflow):
            mode, epochs = flow
            if mode == 'train':
                self._max_iters = self._max_epochs * len(data_loaders[i])
                break

        work_dir = self.work_dir if self.work_dir is not None else 'NONE'
        self.logger.info('Start running, host: %s, work_dir: %s',
                         get_host_info(), work_dir)
        self.logger.info('Hooks will be executed in the following order:\n%s',
                         self.get_hook_info())
        self.logger.info('workflow: %s, max: %d epochs', workflow,
                         self._max_epochs)
        self.call_hook('before_run')

        while self.epoch < self._max_epochs:
            for i, flow in enumerate(workflow):
                mode, epochs = flow
                if isinstance(mode, str):  # self.train()
                    if not hasattr(self, mode):
                        raise ValueError(
                            f'runner has no method named "{mode}" to run an '
                            'epoch')
                    epoch_runner = getattr(self, mode)
                else:
                    raise TypeError(
                        'mode in workflow must be a str, but got {}'.format(
                            type(mode)))

                for _ in range(epochs):
                    # Stop training phases once the epoch budget is used up.
                    if mode == 'train' and self.epoch >= self._max_epochs:
                        break
                    epoch_runner(data_loaders[i], **kwargs)

        time.sleep(1)  # wait for some hooks like loggers to finish
        self.call_hook('after_run')

    def save_checkpoint(self,
                        out_dir,
                        filename_tmpl='epoch_{}.pth',
                        save_optimizer=True,
                        meta=None,
                        create_symlink=True):
        """Save the checkpoint.

        Args:
            out_dir (str): The directory that checkpoints are saved.
            filename_tmpl (str, optional): The checkpoint filename template,
                which contains a placeholder for the epoch number.
                Defaults to 'epoch_{}.pth'.
            save_optimizer (bool, optional): Whether to save the optimizer to
                the checkpoint. Defaults to True.
            meta (dict, optional): The meta information to be saved in the
                checkpoint. A dict passed here is updated in place.
                Defaults to None.
            create_symlink (bool, optional): Whether to create a symlink
                "latest.pth" to point to the latest checkpoint.
                Defaults to True.

        Raises:
            TypeError: If ``meta`` is neither a dict nor None.
        """
        if meta is None:
            meta = {}
        elif not isinstance(meta, dict):
            raise TypeError(
                f'meta should be a dict or None, but got {type(meta)}')
        if self.meta is not None:
            meta.update(self.meta)
            # Note: meta.update(self.meta) should be done before
            # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise
            # there will be problems with resumed checkpoints.
            # More details in https://github.com/open-mmlab/mmcv/pull/1108
        meta.update(epoch=self.epoch + 1, iter=self.iter)

        filename = filename_tmpl.format(self.epoch + 1)
        filepath = osp.join(out_dir, filename)
        optimizer = self.optimizer if save_optimizer else None
        save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta)
        # in some environments, `os.symlink` is not supported, you may need to
        # set `create_symlink` to False
        if create_symlink:
            dst_file = osp.join(out_dir, 'latest.pth')
            if platform.system() != 'Windows':
                mmcv.symlink(filename, dst_file)
            else:
                # On Windows a copy is made instead of a symlink.
                shutil.copy(filepath, dst_file)
def cast_tensor_type(inputs, src_type, dst_type):
    """Recursively convert Tensor in inputs from src_type to dst_type.

    Only tensors whose dtype equals ``src_type`` are converted; tensors of any
    other dtype (e.g. integer labels or masks) are returned untouched, as are
    modules, strings, numpy arrays and non-iterable objects.

    Args:
        inputs: Inputs that to be casted.
        src_type (torch.dtype): Source type.
        dst_type (torch.dtype): Destination type.

    Returns:
        The same type with inputs, but all contained Tensors of ``src_type``
        have been cast to ``dst_type``.
    """
    if isinstance(inputs, nn.Module):
        return inputs
    elif isinstance(inputs, torch.Tensor):
        # Fixed: previously every tensor was cast regardless of its dtype,
        # so src_type was silently ignored.
        return inputs.to(dst_type) if inputs.dtype == src_type else inputs
    elif isinstance(inputs, str):
        # str is Iterable; return it before the generic Iterable branch.
        return inputs
    elif isinstance(inputs, np.ndarray):
        return inputs
    elif isinstance(inputs, abc.Mapping):
        return type(inputs)({
            k: cast_tensor_type(v, src_type, dst_type)
            for k, v in inputs.items()
        })
    elif isinstance(inputs, abc.Iterable):
        # Rebuild the container with its own type (list, tuple, ...).
        return type(inputs)(
            cast_tensor_type(item, src_type, dst_type) for item in inputs)
    else:
        return inputs
+ + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp16 + >>> @auto_fp16() + >>> def forward(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp16 + >>> @auto_fp16(apply_to=('pred', )) + >>> def do_something(self, pred, others): + >>> pass + """ + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + if (TORCH_VERSION != 'parrots' and + digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + with autocast(enabled=True): + output = old_func(*new_args, **new_kwargs) + else: + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + 
output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. If you are using PyTorch >= 1.6, + torch.cuda.amp is used as the backend, otherwise, original mmcv + implementation will be adopted. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp32 + >>> @force_fp32() + >>> def loss(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp32 + >>> @force_fp32(apply_to=('pred', )) + >>> def post_process(self, pred, others): + >>> pass + """ + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. 
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Deprecated alias that forwards to ``dist_utils.allreduce_grads``.

    Emits a warning and then delegates with the same arguments.

    Args:
        params: Parameters whose gradients are all-reduced.
        coalesce (bool, optional): Forwarded unchanged. Defaults to True.
        bucket_size_mb (int, optional): Forwarded unchanged. Defaults to -1.
    """
    # Fixed: the `warnings` module has no `warning` function, so the original
    # call raised AttributeError before any gradient was reduced. The closing
    # quote of the message was also missing.
    warnings.warn(
        '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be '
        'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads"')
    _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb)
def patch_norm_fp32(module):
    """Recursively convert normalization layers from FP16 to FP32.

    Batch-norm and group-norm layers are switched to float in place. For
    ``nn.GroupNorm`` (and for any norm layer when ``torch.__version__``
    compares below ``'1.3'``) the layer's ``forward`` is additionally wrapped
    by ``patch_forward_method`` with ``torch.half`` / ``torch.float``.

    Args:
        module (nn.Module): The modules to be converted in FP16.

    Returns:
        nn.Module: The converted module, the normalization layers have been
        converted to FP32.
    """
    norm_types = (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)
    if isinstance(module, norm_types):
        module.float()
        needs_wrapper = (
            isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3')
        if needs_wrapper:
            module.forward = patch_forward_method(module.forward, torch.half,
                                                  torch.float)
    for submodule in module.children():
        patch_norm_fp32(submodule)
    return module
class LossScaler:
    """Class that manages loss scaling in mixed precision training which
    supports both dynamic or static mode.

    The implementation refers to
    https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py.
    Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling.
    It's important to understand how :class:`LossScaler` operates.
    Loss scaling is designed to combat the problem of underflowing
    gradients encountered at long times when training fp16 networks.
    Dynamic loss scaling begins by attempting a very high loss
    scale. Ironically, this may result in OVERflowing gradients.
    If overflowing gradients are encountered, :class:`FP16_Optimizer` then
    skips the update step for this particular iteration/minibatch,
    and :class:`LossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients
    detected, :class:`LossScaler` increases the loss scale once more.
    In this way :class:`LossScaler` attempts to "ride the edge" of always
    using the highest loss scale possible without incurring overflow.

    Args:
        init_scale (float): Initial loss scale value, default: 2**32.
        scale_factor (float): Factor used when adjusting the loss scale.
            Default: 2.
        mode (str): Loss scaling mode. 'dynamic' or 'static'
        scale_window (int): Number of consecutive iterations without an
            overflow to wait before increasing the loss scale. Default: 1000.
    """

    def __init__(self,
                 init_scale=2**32,
                 mode='dynamic',
                 scale_factor=2.,
                 scale_window=1000):
        self.cur_scale = init_scale
        self.cur_iter = 0
        assert mode in ('dynamic',
                        'static'), 'mode can only be dynamic or static'
        self.mode = mode
        # -1 means no overflow has been recorded yet.
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window

    def has_overflow(self, params):
        """Check if the gradients of ``params`` contain inf or NaN.

        Always returns False in static mode. Parameters whose ``grad`` is
        None are skipped.

        Args:
            params (Iterable): Objects exposing a ``grad`` attribute.

        Returns:
            bool: True if any gradient contains inf or NaN.
        """
        if self.mode != 'dynamic':
            return False
        for p in params:
            if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data):
                return True
        return False

    # Declared static so that it works both as
    # ``LossScaler._has_inf_or_nan(x)`` and as ``scaler._has_inf_or_nan(x)``;
    # without the decorator the instance form passes the scaler itself as
    # ``x`` and fails.
    @staticmethod
    def _has_inf_or_nan(x):
        """Check if ``x`` contains inf or NaN.

        The check is done on the float sum of ``x``. A RuntimeError whose
        message contains 'value cannot be converted' is also treated as an
        overflow; any other RuntimeError is re-raised.

        Returns:
            bool: True if the sum is +inf, -inf or NaN.
        """
        try:
            cpu_sum = float(x.float().sum())
        except RuntimeError as instance:
            if 'value cannot be converted' not in instance.args[0]:
                raise
            return True
        else:
            # `cpu_sum != cpu_sum` is only true for NaN.
            if cpu_sum == float('inf') or cpu_sum == -float('inf') \
                    or cpu_sum != cpu_sum:
                return True
            return False

    def update_scale(self, overflow):
        """Update the current loss scale value after an iteration.

        No-op in static mode (the iteration counter is not advanced either).
        In dynamic mode the scale is divided by ``scale_factor`` on overflow
        (never going below 1) and multiplied by ``scale_factor`` whenever the
        number of iterations since the last overflow is a multiple of
        ``scale_window``.

        Args:
            overflow (bool): Whether this iteration overflowed.
        """
        if self.mode != 'dynamic':
            return
        if overflow:
            self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
            if (self.cur_iter - self.last_overflow_iter) % \
                    self.scale_window == 0:
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    def state_dict(self):
        """Returns the state of the scaler as a :class:`dict`."""
        return dict(
            cur_scale=self.cur_scale,
            cur_iter=self.cur_iter,
            mode=self.mode,
            last_overflow_iter=self.last_overflow_iter,
            scale_factor=self.scale_factor,
            scale_window=self.scale_window)

    def load_state_dict(self, state_dict):
        """Loads the loss_scaler state dict.

        Args:
            state_dict (dict): scaler state, with the keys produced by
                :meth:`state_dict`.
        """
        self.cur_scale = state_dict['cur_scale']
        self.cur_iter = state_dict['cur_iter']
        self.mode = state_dict['mode']
        self.last_overflow_iter = state_dict['last_overflow_iter']
        self.scale_factor = state_dict['scale_factor']
        self.scale_window = state_dict['scale_window']

    @property
    def loss_scale(self):
        """Current loss scale value."""
        return self.cur_scale
'GradientCumulativeFp16OptimizerHook' +] diff --git a/lavis/common/annotator/uniformer/mmcv/runner/hooks/checkpoint.py b/lavis/common/annotator/uniformer/mmcv/runner/hooks/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..6af3fae43ac4b35532641a81eb13557edfc7dfba --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/runner/hooks/checkpoint.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings + +from annotator.uniformer.mmcv.fileio import FileClient +from ..dist_utils import allreduce_params, master_only +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class CheckpointHook(Hook): + """Save checkpoints periodically. + + Args: + interval (int): The saving period. If ``by_epoch=True``, interval + indicates epochs, otherwise it indicates iterations. + Default: -1, which means "never". + by_epoch (bool): Saving checkpoints by epoch or by iteration. + Default: True. + save_optimizer (bool): Whether to save optimizer state_dict in the + checkpoint. It is usually used for resuming experiments. + Default: True. + out_dir (str, optional): The root directory to save checkpoints. If not + specified, ``runner.work_dir`` will be used by default. If + specified, the ``out_dir`` will be the concatenation of ``out_dir`` + and the last level directory of ``runner.work_dir``. + `Changed in version 1.3.16.` + max_keep_ckpts (int, optional): The maximum checkpoints to keep. + In some cases we want only the latest few checkpoints and would + like to delete old ones to save the disk space. + Default: -1, which means unlimited. + save_last (bool, optional): Whether to force the last checkpoint to be + saved regardless of interval. Default: True. + sync_buffer (bool, optional): Whether to synchronize buffers in + different gpus. Default: False. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. 
+ `New in version 1.3.16.` + + .. warning:: + Before v1.3.16, the ``out_dir`` argument indicates the path where the + checkpoint is stored. However, since v1.3.16, ``out_dir`` indicates the + root directory and the final path to save checkpoint is the + concatenation of ``out_dir`` and the last level directory of + ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A" + and the value of ``runner.work_dir`` is "/path/of/B", then the final + path will be "/path/of/A/B". + """ + + def __init__(self, + interval=-1, + by_epoch=True, + save_optimizer=True, + out_dir=None, + max_keep_ckpts=-1, + save_last=True, + sync_buffer=False, + file_client_args=None, + **kwargs): + self.interval = interval + self.by_epoch = by_epoch + self.save_optimizer = save_optimizer + self.out_dir = out_dir + self.max_keep_ckpts = max_keep_ckpts + self.save_last = save_last + self.args = kwargs + self.sync_buffer = sync_buffer + self.file_client_args = file_client_args + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + + runner.logger.info((f'Checkpoints will be saved to {self.out_dir} by ' + f'{self.file_client.name}.')) + + # disable the create_symlink option because some file backends do not + # allow to create a symlink + if 'create_symlink' in self.args: + if self.args[ + 'create_symlink'] and not self.file_client.allow_symlink: + self.args['create_symlink'] = False + warnings.warn( + ('create_symlink is set as True by the user but is changed' + 'to be False because 
creating symbolic link is not ' + f'allowed in {self.file_client.name}')) + else: + self.args['create_symlink'] = self.file_client.allow_symlink + + def after_train_epoch(self, runner): + if not self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` epochs + # 2. reach the last epoch of training + if self.every_n_epochs( + runner, self.interval) or (self.save_last + and self.is_last_epoch(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.epoch + 1} epochs') + if self.sync_buffer: + allreduce_params(runner.model.buffers()) + self._save_checkpoint(runner) + + @master_only + def _save_checkpoint(self, runner): + """Save the current checkpoint and delete unwanted checkpoint.""" + runner.save_checkpoint( + self.out_dir, save_optimizer=self.save_optimizer, **self.args) + if runner.meta is not None: + if self.by_epoch: + cur_ckpt_filename = self.args.get( + 'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1) + else: + cur_ckpt_filename = self.args.get( + 'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1) + runner.meta.setdefault('hook_msgs', dict()) + runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path( + self.out_dir, cur_ckpt_filename) + # remove other checkpoints + if self.max_keep_ckpts > 0: + if self.by_epoch: + name = 'epoch_{}.pth' + current_ckpt = runner.epoch + 1 + else: + name = 'iter_{}.pth' + current_ckpt = runner.iter + 1 + redundant_ckpts = range( + current_ckpt - self.max_keep_ckpts * self.interval, 0, + -self.interval) + filename_tmpl = self.args.get('filename_tmpl', name) + for _step in redundant_ckpts: + ckpt_path = self.file_client.join_path( + self.out_dir, filename_tmpl.format(_step)) + if self.file_client.isfile(ckpt_path): + self.file_client.remove(ckpt_path) + else: + break + + def after_train_iter(self, runner): + if self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` iterations + # 2. 
@HOOKS.register_module()
class EMAHook(Hook):
    r"""Exponential Moving Average Hook.

    Use Exponential Moving Average on all parameters of model in training
    process. All parameters have an ema backup, which is updated by the
    formula below. EMAHook takes priority over EvalHook and
    CheckpointSaverHook.

        .. math::

            \text{Xema\_{t+1}} = (1 - \text{momentum}) \times
            \text{Xema\_{t}} + \text{momentum} \times X_t

    Args:
        momentum (float): The momentum used for updating ema parameter.
            Defaults to 0.0002.
        interval (int): Update ema parameter every interval iteration.
            Defaults to 1.
        warm_up (int): During first warm_up steps, we may use smaller momentum
            to update ema parameters more slowly. Defaults to 100.
        resume_from (str): The checkpoint path. Defaults to None.
    """

    def __init__(self,
                 momentum=0.0002,
                 interval=1,
                 warm_up=100,
                 resume_from=None):
        assert isinstance(interval, int) and interval > 0
        self.warm_up = warm_up
        self.interval = interval
        assert momentum > 0 and momentum < 1
        # NOTE(review): the stored momentum is raised to the power of
        # ``interval``; kept as-is, confirm this is the intended scaling.
        self.momentum = momentum**interval
        self.checkpoint = resume_from

    def before_run(self, runner):
        """To resume model with its ema parameters more friendly.

        Register ema parameter as ``named_buffer`` to model, then resume
        from ``resume_from`` if one was given.
        """
        model = runner.model
        if is_module_wrapper(model):
            model = model.module
        # Maps parameter name -> name of the buffer holding its ema copy.
        self.param_ema_buffer = {}
        self.model_parameters = dict(model.named_parameters(recurse=True))
        for name, value in self.model_parameters.items():
            # "." is not allowed in module's buffer name
            buffer_name = f"ema_{name.replace('.', '_')}"
            self.param_ema_buffer[name] = buffer_name
            model.register_buffer(buffer_name, value.data.clone())
        self.model_buffers = dict(model.named_buffers(recurse=True))
        if self.checkpoint is not None:
            runner.resume(self.checkpoint)

    def after_train_iter(self, runner):
        """Update ema parameter every self.interval iterations."""
        curr_step = runner.iter
        if curr_step % self.interval != 0:
            return
        # We warm up the momentum considering the instability at beginning
        momentum = min(self.momentum,
                       (1 + curr_step) / (self.warm_up + curr_step))
        for name, parameter in self.model_parameters.items():
            buffer_name = self.param_ema_buffer[name]
            buffer_parameter = self.model_buffers[buffer_name]
            # ema = (1 - momentum) * ema + momentum * param, in place.
            # Uses the ``alpha=`` keyword form; the positional
            # ``add_(Number, Tensor)`` overload is deprecated in PyTorch.
            buffer_parameter.mul_(1 - momentum).add_(
                parameter.data, alpha=momentum)

    def after_train_epoch(self, runner):
        """We load parameter values from ema backup to model before the
        EvalHook."""
        self._swap_ema_parameters()

    def before_train_epoch(self, runner):
        """We recover model's parameter from ema backup after last epoch's
        EvalHook."""
        self._swap_ema_parameters()

    def _swap_ema_parameters(self):
        """Swap the parameter of model with parameter in ema_buffer."""
        for name, value in self.model_parameters.items():
            temp = value.data.clone()
            ema_buffer = self.model_buffers[self.param_ema_buffer[name]]
            value.data.copy_(ema_buffer.data)
            ema_buffer.data.copy_(temp)
The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader, and return the test results. If ``None``, the default + test function ``mmcv.engine.single_gpu_test`` will be used. + (default: ``None``) + greater_keys (List[str] | None, optional): Metric keys that will be + inferred by 'greater' comparison rule. If ``None``, + _default_greater_keys will be used. (default: ``None``) + less_keys (List[str] | None, optional): Metric keys that will be + inferred by 'less' comparison rule. If ``None``, _default_less_keys + will be used. (default: ``None``) + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + `New in version 1.3.16.` + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + `New in version 1.3.16.` + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + + Notes: + If new arguments are added for EvalHook, tools/test.py, + tools/eval_metric.py may be affected. 
+ """ + + # Since the key for determine greater or less is related to the downstream + # tasks, downstream repos may need to overwrite the following inner + # variable accordingly. + + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + _default_greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', + 'mAcc', 'aAcc' + ] + _default_less_keys = ['loss'] + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=None, + less_keys=None, + out_dir=None, + file_client_args=None, + **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + if interval <= 0: + raise ValueError(f'interval must be a positive number, ' + f'but got {interval}') + + assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' + + if start is not None and start < 0: + raise ValueError(f'The evaluation start epoch {start} is smaller ' + f'than 0') + + self.dataloader = dataloader + self.interval = interval + self.start = start + self.by_epoch = by_epoch + + assert isinstance(save_best, str) or save_best is None, \ + '""save_best"" should be a str or None ' \ + f'rather than {type(save_best)}' + self.save_best = save_best + self.eval_kwargs = eval_kwargs + self.initial_flag = True + + if test_fn is None: + from annotator.uniformer.mmcv.engine import single_gpu_test + self.test_fn = single_gpu_test + else: + self.test_fn = test_fn + + if greater_keys is None: + self.greater_keys = self._default_greater_keys + else: + if not isinstance(greater_keys, (list, tuple)): + greater_keys = (greater_keys, ) + assert is_seq_of(greater_keys, str) + self.greater_keys = greater_keys + + if less_keys is None: + self.less_keys = self._default_less_keys + else: + if not isinstance(less_keys, (list, tuple)): + less_keys = 
(less_keys, ) + assert is_seq_of(less_keys, str) + self.less_keys = less_keys + + if self.save_best is not None: + self.best_ckpt_path = None + self._init_rule(rule, self.save_best) + + self.out_dir = out_dir + self.file_client_args = file_client_args + + def _init_rule(self, rule, key_indicator): + """Initialize rule, key_indicator, comparison_func, and best score. + + Here is the rule to determine which rule is used for key indicator + when the rule is not specific (note that the key indicator matching + is case-insensitive): + 1. If the key indicator is in ``self.greater_keys``, the rule will be + specified as 'greater'. + 2. Or if the key indicator is in ``self.less_keys``, the rule will be + specified as 'less'. + 3. Or if the key indicator is equal to the substring in any one item + in ``self.greater_keys``, the rule will be specified as 'greater'. + 4. Or if the key indicator is equal to the substring in any one item + in ``self.less_keys``, the rule will be specified as 'less'. + + Args: + rule (str | None): Comparison rule for best score. + key_indicator (str | None): Key indicator to determine the + comparison rule. 
+ """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + # `_lc` here means we use the lower case of keys for + # case-insensitive matching + key_indicator_lc = key_indicator.lower() + greater_keys = [key.lower() for key in self.greater_keys] + less_keys = [key.lower() for key in self.less_keys] + + if key_indicator_lc in greater_keys: + rule = 'greater' + elif key_indicator_lc in less_keys: + rule = 'less' + elif any(key in key_indicator_lc for key in greater_keys): + rule = 'greater' + elif any(key in key_indicator_lc for key in less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + (f'The best checkpoint will be saved to {self.out_dir} by ' + f'{self.file_client.name}')) + + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. 
Creating an empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + self.best_ckpt_path = runner.meta['hook_msgs'].get( + 'best_ckpt', None) + + def before_train_iter(self, runner): + """Evaluate the model only at the start of training by iteration.""" + if self.by_epoch or not self.initial_flag: + return + if self.start is not None and runner.iter >= self.start: + self.after_train_iter(runner) + self.initial_flag = False + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + if not (self.by_epoch and self.initial_flag): + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_flag = False + + def after_train_iter(self, runner): + """Called after every training iter to evaluate the results.""" + if not self.by_epoch and self._should_evaluate(runner): + # Because the priority of EvalHook is higher than LoggerHook, the + # training log and the evaluating log are mixed. Therefore, + # we need to dump the training log and clear it before evaluating + # log is generated. In addition, this problem will only appear in + # `IterBasedRunner` whose `self.by_epoch` is False, because + # `EpochBasedRunner` whose `self.by_epoch` is True calls + # `_do_evaluate` in `after_train_epoch` stage, and at this stage + # the training log has been printed, so it will not cause any + # problem. 
more details at + # https://github.com/open-mmlab/mmsegmentation/issues/694 + for hook in runner._hooks: + if isinstance(hook, LoggerHook): + hook.after_train_iter(runner) + runner.log_buffer.clear() + + self._do_evaluate(runner) + + def after_train_epoch(self, runner): + """Called after every training epoch to evaluate the results.""" + if self.by_epoch and self._should_evaluate(runner): + self._do_evaluate(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + results = self.test_fn(runner.model, self.dataloader) + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to save + # the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) + + def _should_evaluate(self, runner): + """Judge whether to perform evaluation. + + Here is the rule to judge whether to perform evaluation: + 1. It will not perform evaluation during the epoch/iteration interval, + which is determined by ``self.interval``. + 2. It will not perform evaluation if the start time is larger than + current time. + 3. It will not perform evaluation when current time is larger than + the start time but during epoch/iteration interval. + + Returns: + bool: The flag indicating whether to perform evaluation. + """ + if self.by_epoch: + current = runner.epoch + check_time = self.every_n_epochs + else: + current = runner.iter + check_time = self.every_n_iters + + if self.start is None: + if not check_time(runner, self.interval): + # No evaluation during the interval. + return False + elif (current + 1) < self.start: + # No evaluation if start is larger than the current time. + return False + else: + # Evaluation only at epochs/iters 3, 5, 7... 
+ # if start==3 and interval==2 + if (current + 1 - self.start) % self.interval: + return False + return True + + def _save_ckpt(self, runner, key_score): + """Save the best checkpoint. + + It will compare the score according to the compare function, write + related information (best score, best checkpoint path) and save the + best checkpoint into ``work_dir``. + """ + if self.by_epoch: + current = f'epoch_{runner.epoch + 1}' + cur_type, cur_time = 'epoch', runner.epoch + 1 + else: + current = f'iter_{runner.iter + 1}' + cur_type, cur_time = 'iter', runner.iter + 1 + + best_score = runner.meta['hook_msgs'].get( + 'best_score', self.init_value_map[self.rule]) + if self.compare_func(key_score, best_score): + best_score = key_score + runner.meta['hook_msgs']['best_score'] = best_score + + if self.best_ckpt_path and self.file_client.isfile( + self.best_ckpt_path): + self.file_client.remove(self.best_ckpt_path) + runner.logger.info( + (f'The previous best checkpoint {self.best_ckpt_path} was ' + 'removed')) + + best_ckpt_name = f'best_{self.key_indicator}_{current}.pth' + self.best_ckpt_path = self.file_client.join_path( + self.out_dir, best_ckpt_name) + runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path + + runner.save_checkpoint( + self.out_dir, best_ckpt_name, create_symlink=False) + runner.logger.info( + f'Now best checkpoint is saved as {best_ckpt_name}.') + runner.logger.info( + f'Best {self.key_indicator} is {best_score:0.4f} ' + f'at {cur_time} {cur_type}.') + + def evaluate(self, runner, results): + """Evaluate the results. + + Args: + runner (:obj:`mmcv.Runner`): The underlined training runner. + results (list): Output results. 
class DistEvalHook(EvalHook):
    """Distributed evaluation hook.

    This hook will regularly perform evaluation in a given interval when
    performing in distributed environment.

    Args:
        dataloader (DataLoader): A PyTorch dataloader, whose dataset has
            implemented ``evaluate`` function.
        start (int | None, optional): Evaluation starting epoch. It enables
            evaluation before the training starts if ``start`` <= the resuming
            epoch. If None, whether to evaluate is merely decided by
            ``interval``. Default: None.
        interval (int): Evaluation interval. Default: 1.
        by_epoch (bool): Determine perform evaluation by epoch or by iteration.
            If set to True, it will perform by epoch. Otherwise, by iteration.
            default: True.
        save_best (str, optional): If a metric is specified, it would measure
            the best checkpoint during evaluation. The information about best
            checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep
            best score value and best checkpoint path, which will be also
            loaded when resume checkpoint. Options are the evaluation metrics
            on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox
            detection and instance segmentation. ``AR@100`` for proposal
            recall. If ``save_best`` is ``auto``, the first key of the returned
            ``OrderedDict`` result will be used. Default: None.
        rule (str | None, optional): Comparison rule for best score. If set to
            None, it will infer a reasonable rule. Keys such as 'acc', 'top'
            .etc will be inferred by 'greater' rule. Keys contain 'loss' will
            be inferred by 'less' rule. Options are 'greater', 'less', None.
            Default: None.
        test_fn (callable, optional): test a model with samples from a
            dataloader in a multi-gpu manner, and return the test results. If
            ``None``, the default test function ``mmcv.engine.multi_gpu_test``
            will be used. (default: ``None``)
        greater_keys (List[str] | None, optional): Metric keys that will be
            inferred by 'greater' comparison rule. Forwarded unchanged to
            :class:`EvalHook`. (default: ``None``)
        less_keys (List[str] | None, optional): Metric keys that will be
            inferred by 'less' comparison rule. Forwarded unchanged to
            :class:`EvalHook`. (default: ``None``)
        tmpdir (str | None): Temporary directory to save the results of all
            processes. Default: None.
        gpu_collect (bool): Whether to use gpu or cpu to collect results.
            Default: False.
        broadcast_bn_buffer (bool): Whether to broadcast the
            buffer(running_mean and running_var) of rank 0 to other rank
            before evaluation. Default: True.
        out_dir (str, optional): The root directory to save checkpoints. If not
            specified, `runner.work_dir` will be used by default. If specified,
            the `out_dir` will be the concatenation of `out_dir` and the last
            level directory of `runner.work_dir`.
        file_client_args (dict): Arguments to instantiate a FileClient.
            See :class:`mmcv.fileio.FileClient` for details. Default: None.
        **eval_kwargs: Evaluation arguments fed into the evaluate function of
            the dataset.
    """

    def __init__(self,
                 dataloader,
                 start=None,
                 interval=1,
                 by_epoch=True,
                 save_best=None,
                 rule=None,
                 test_fn=None,
                 greater_keys=None,
                 less_keys=None,
                 broadcast_bn_buffer=True,
                 tmpdir=None,
                 gpu_collect=False,
                 out_dir=None,
                 file_client_args=None,
                 **eval_kwargs):

        if test_fn is None:
            # Imported lazily, only when the default test function is needed.
            from annotator.uniformer.mmcv.engine import multi_gpu_test
            test_fn = multi_gpu_test

        super().__init__(
            dataloader,
            start=start,
            interval=interval,
            by_epoch=by_epoch,
            save_best=save_best,
            rule=rule,
            test_fn=test_fn,
            greater_keys=greater_keys,
            less_keys=less_keys,
            out_dir=out_dir,
            file_client_args=file_client_args,
            **eval_kwargs)

        self.broadcast_bn_buffer = broadcast_bn_buffer
        self.tmpdir = tmpdir
        self.gpu_collect = gpu_collect

    def _do_evaluate(self, runner):
        """perform evaluation and save ckpt."""
        # Synchronization of BatchNorm's buffer (running_mean
        # and running_var) is not supported in the DDP of pytorch,
        # which may cause the inconsistent performance of models in
        # different ranks, so we broadcast BatchNorm's buffers
        # of rank 0 to other ranks to avoid this.
        if self.broadcast_bn_buffer:
            for module in runner.model.modules():
                if isinstance(module,
                              _BatchNorm) and module.track_running_stats:
                    dist.broadcast(module.running_var, 0)
                    dist.broadcast(module.running_mean, 0)

        tmpdir = self.tmpdir
        if tmpdir is None:
            tmpdir = osp.join(runner.work_dir, '.eval_hook')

        results = self.test_fn(
            runner.model,
            self.dataloader,
            tmpdir=tmpdir,
            gpu_collect=self.gpu_collect)
        # Only rank 0 logs, evaluates and saves the best checkpoint.
        if runner.rank == 0:
            print('\n')
            runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
            key_score = self.evaluate(runner, results)
            # the key_score may be `None` so it needs to skip the action to
            # save the best checkpoint. Test against None explicitly: a
            # plain truthiness test would also skip a legitimate score of 0.
            if self.save_best and key_score is not None:
                self._save_ckpt(runner, key_score)
class Hook:
    """Base class for runner hooks.

    Every stage callback is a no-op by default. The train/val specific
    callbacks delegate to the generic ``before_epoch`` / ``after_epoch`` /
    ``before_iter`` / ``after_iter`` callbacks, so a subclass may override
    either the generic or the specific variant.
    """

    # Stage names in the order used to sort the result of
    # ``get_triggered_stages``.
    stages = ('before_run', 'before_train_epoch', 'before_train_iter',
              'after_train_iter', 'after_train_epoch', 'before_val_epoch',
              'before_val_iter', 'after_val_iter', 'after_val_epoch',
              'after_run')

    # ---- generic callbacks (no-ops) ----

    def before_run(self, runner):
        pass

    def after_run(self, runner):
        pass

    def before_epoch(self, runner):
        pass

    def after_epoch(self, runner):
        pass

    def before_iter(self, runner):
        pass

    def after_iter(self, runner):
        pass

    # ---- train/val callbacks delegating to the generic ones ----

    def before_train_epoch(self, runner):
        self.before_epoch(runner)

    def before_val_epoch(self, runner):
        self.before_epoch(runner)

    def after_train_epoch(self, runner):
        self.after_epoch(runner)

    def after_val_epoch(self, runner):
        self.after_epoch(runner)

    def before_train_iter(self, runner):
        self.before_iter(runner)

    def before_val_iter(self, runner):
        self.before_iter(runner)

    def after_train_iter(self, runner):
        self.after_iter(runner)

    def after_val_iter(self, runner):
        self.after_iter(runner)

    # ---- schedule predicates ----

    def every_n_epochs(self, runner, n):
        """True when the 1-based epoch count is a multiple of ``n`` > 0."""
        if n > 0:
            return (runner.epoch + 1) % n == 0
        return False

    def every_n_inner_iters(self, runner, n):
        """True when the 1-based inner iteration is a multiple of ``n`` > 0."""
        if n > 0:
            return (runner.inner_iter + 1) % n == 0
        return False

    def every_n_iters(self, runner, n):
        """True when the 1-based global iteration is a multiple of ``n`` > 0."""
        if n > 0:
            return (runner.iter + 1) % n == 0
        return False

    def end_of_epoch(self, runner):
        """True on the last inner iteration of ``runner.data_loader``."""
        return len(runner.data_loader) == runner.inner_iter + 1

    def is_last_epoch(self, runner):
        return runner._max_epochs == runner.epoch + 1

    def is_last_iter(self, runner):
        return runner._max_iters == runner.iter + 1

    def get_triggered_stages(self):
        """Return the stages this hook overrides, ordered as in ``stages``."""
        triggered = {
            stage for stage in Hook.stages
            if is_method_overridden(stage, Hook, self)
        }

        # Overriding a generic callback implicitly triggers both the train
        # and the val variant of that stage.
        generic_to_stages = {
            'before_epoch': ['before_train_epoch', 'before_val_epoch'],
            'after_epoch': ['after_train_epoch', 'after_val_epoch'],
            'before_iter': ['before_train_iter', 'before_val_iter'],
            'after_iter': ['after_train_iter', 'after_val_iter'],
        }
        for generic_name, mapped in generic_to_stages.items():
            if is_method_overridden(generic_name, Hook, self):
                triggered.update(mapped)

        return [stage for stage in Hook.stages if stage in triggered]
class LoggerHook(Hook):
    """Base class for logger hooks.

    Subclasses implement :meth:`log`; this class decides when the runner's
    log buffer is averaged, logged and cleared.

    Args:
        interval (int): Logging interval (every k iterations).
        ignore_last (bool): Ignore the log of last iterations in each epoch
            if less than `interval`.
        reset_flag (bool): Whether to clear the output buffer after logging.
        by_epoch (bool): Whether EpochBasedRunner is used.
    """

    __metaclass__ = ABCMeta

    def __init__(self,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 by_epoch=True):
        self.interval = interval
        self.ignore_last = ignore_last
        self.reset_flag = reset_flag
        self.by_epoch = by_epoch

    @abstractmethod
    def log(self, runner):
        """Emit the content of ``runner.log_buffer.output``."""
        pass

    @staticmethod
    def is_scalar(val, include_np=True, include_torch=True):
        """Tell the input variable is a scalar or not.

        Args:
            val: Input variable.
            include_np (bool): Whether include 0-d np.ndarray as a scalar.
            include_torch (bool): Whether to treat a single-element
                torch.Tensor (0-d included) as a scalar.

        Returns:
            bool: True or False.
        """
        if isinstance(val, numbers.Number):
            return True
        if include_np and isinstance(val, np.ndarray) and val.ndim == 0:
            return True
        # `numel()` is used instead of `len()`: `len()` raises TypeError on
        # a 0-d tensor and reports 1 for any tensor of shape (1, k).
        if (include_torch and isinstance(val, torch.Tensor)
                and val.numel() == 1):
            return True
        return False

    def get_mode(self, runner):
        """Return 'train' or 'val' for the values currently in the buffer."""
        if runner.mode == 'train':
            # A buffer without 'time' while the runner is in train mode is
            # reported as 'val'.
            if 'time' in runner.log_buffer.output:
                mode = 'train'
            else:
                mode = 'val'
        elif runner.mode == 'val':
            mode = 'val'
        else:
            raise ValueError(f"runner mode should be 'train' or 'val', "
                             f'but got {runner.mode}')
        return mode

    def get_epoch(self, runner):
        """Return the 1-based epoch number to report."""
        if runner.mode == 'train':
            epoch = runner.epoch + 1
        elif runner.mode == 'val':
            # normal val mode
            # runner.epoch += 1 has been done before val workflow
            epoch = runner.epoch
        else:
            raise ValueError(f"runner mode should be 'train' or 'val', "
                             f'but got {runner.mode}')
        return epoch

    def get_iter(self, runner, inner_iter=False):
        """Get the current training iteration step."""
        if self.by_epoch and inner_iter:
            current_iter = runner.inner_iter + 1
        else:
            current_iter = runner.iter + 1
        return current_iter

    def get_lr_tags(self, runner):
        """Return the first learning rate of each entry as loggable tags."""
        tags = {}
        lrs = runner.current_lr()
        if isinstance(lrs, dict):
            for name, value in lrs.items():
                tags[f'learning_rate/{name}'] = value[0]
        else:
            tags['learning_rate'] = lrs[0]
        return tags

    def get_momentum_tags(self, runner):
        """Return the first momentum of each entry as loggable tags."""
        tags = {}
        momentums = runner.current_momentum()
        if isinstance(momentums, dict):
            for name, value in momentums.items():
                tags[f'momentum/{name}'] = value[0]
        else:
            tags['momentum'] = momentums[0]
        return tags

    def get_loggable_tags(self,
                          runner,
                          allow_scalar=True,
                          allow_text=False,
                          add_mode=True,
                          tags_to_skip=('time', 'data_time')):
        """Collect buffer values plus lr/momentum into one tag dict.

        Args:
            runner: Runner whose ``log_buffer.output`` is read.
            allow_scalar (bool): Keep scalar values.
            allow_text (bool): Keep ``str`` values.
            add_mode (bool): Prefix buffer keys with ``'<mode>/'``.
            tags_to_skip (tuple[str]): Buffer keys that are never returned.

        Returns:
            dict: Tag name to value.
        """
        tags = {}
        for var, val in runner.log_buffer.output.items():
            if var in tags_to_skip:
                continue
            if self.is_scalar(val) and not allow_scalar:
                continue
            if isinstance(val, str) and not allow_text:
                continue
            if add_mode:
                var = f'{self.get_mode(runner)}/{var}'
            tags[var] = val
        tags.update(self.get_lr_tags(runner))
        tags.update(self.get_momentum_tags(runner))
        return tags

    def before_run(self, runner):
        # Only the last registered logger hook resets the buffer, so every
        # earlier logger hook still sees the values.
        for hook in runner.hooks[::-1]:
            if isinstance(hook, LoggerHook):
                hook.reset_flag = True
                break

    def before_epoch(self, runner):
        runner.log_buffer.clear()  # clear logs of last epoch

    def after_train_iter(self, runner):
        if self.by_epoch and self.every_n_inner_iters(runner, self.interval):
            runner.log_buffer.average(self.interval)
        elif not self.by_epoch and self.every_n_iters(runner, self.interval):
            runner.log_buffer.average(self.interval)
        elif self.end_of_epoch(runner) and not self.ignore_last:
            # not precise but more stable
            runner.log_buffer.average(self.interval)

        if runner.log_buffer.ready:
            self.log(runner)
            if self.reset_flag:
                runner.log_buffer.clear_output()

    def after_train_epoch(self, runner):
        if runner.log_buffer.ready:
            self.log(runner)
            if self.reset_flag:
                runner.log_buffer.clear_output()

    def after_val_epoch(self, runner):
        runner.log_buffer.average()
        self.log(runner)
        if self.reset_flag:
            runner.log_buffer.clear_output()
@HOOKS.register_module()
class MlflowLoggerHook(LoggerHook):

    def __init__(self,
                 exp_name=None,
                 tags=None,
                 log_model=True,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 by_epoch=True):
        """Logger hook that sends metrics, and optionally the model, to MLflow.

        `MLflow`_ must be installed.

        Args:
            exp_name (str, optional): Experiment to activate in
                ``before_run``. Nothing is activated when None.
                Default None.
            tags (dict of str: str, optional): Tags set on the current run
                in ``before_run``. Nothing is set when None. Default None.
            log_model (bool, optional): If True, ``runner.model`` is logged
                as an MLflow artifact in ``after_run``. Default True.
            interval (int): Logging interval (every k iterations).
            ignore_last (bool): Ignore the log of last iterations in each
                epoch if less than `interval`.
            reset_flag (bool): Whether to clear the output buffer after
                logging.
            by_epoch (bool): Whether EpochBasedRunner is used.

        .. _MLflow:
            https://www.mlflow.org/docs/latest/index.html
        """
        super().__init__(interval, ignore_last, reset_flag, by_epoch)
        self.import_mlflow()
        self.exp_name = exp_name
        self.tags = tags
        self.log_model = log_model

    def import_mlflow(self):
        """Import mlflow lazily and keep the modules on the instance."""
        try:
            import mlflow
            import mlflow.pytorch as mlflow_pytorch
        except ImportError:
            raise ImportError(
                'Please run "pip install mlflow" to install mlflow')
        self.mlflow = mlflow
        self.mlflow_pytorch = mlflow_pytorch

    @master_only
    def before_run(self, runner):
        super().before_run(runner)
        experiment, run_tags = self.exp_name, self.tags
        if experiment is not None:
            self.mlflow.set_experiment(experiment)
        if run_tags is not None:
            self.mlflow.set_tags(run_tags)

    @master_only
    def log(self, runner):
        metrics = self.get_loggable_tags(runner)
        if not metrics:
            return
        self.mlflow.log_metrics(metrics, step=self.get_iter(runner))

    @master_only
    def after_run(self, runner):
        if not self.log_model:
            return
        self.mlflow_pytorch.log_model(runner.model, 'models')
@HOOKS.register_module()
class NeptuneLoggerHook(LoggerHook):
    """Class to log metrics to NeptuneAI.

    It requires `neptune-client` to be installed.

    Args:
        init_kwargs (dict): a dict contains the initialization keys as below:
            - project (str): Name of a project in a form of
                namespace/project_name. If None, the value of
                NEPTUNE_PROJECT environment variable will be taken.
            - api_token (str): User's API token.
                If None, the value of NEPTUNE_API_TOKEN environment
                variable will be taken. Note: It is strongly recommended
                to use NEPTUNE_API_TOKEN environment variable rather than
                placing your API token in plain text in your source code.
            - name (str, optional, default is 'Untitled'): Editable name of
                the run. Name is displayed in the run's Details and in
                Runs table as a column.
            Check https://docs.neptune.ai/api-reference/neptune#init for
                more init arguments.
        interval (int): Logging interval (every k iterations).
        ignore_last (bool): Ignore the log of last iterations in each epoch
            if less than `interval`.
        reset_flag (bool): Whether to clear the output buffer after logging
        with_step (bool): If True, every value is logged with an explicit
            ``step``. If False, the step is logged as an additional
            ``global_step`` field instead.
        by_epoch (bool): Whether EpochBasedRunner is used.

    .. _NeptuneAI:
        https://docs.neptune.ai/you-should-know/logging-metadata
    """

    def __init__(self,
                 init_kwargs=None,
                 interval=10,
                 ignore_last=True,
                 reset_flag=True,
                 with_step=True,
                 by_epoch=True):

        super(NeptuneLoggerHook, self).__init__(interval, ignore_last,
                                                reset_flag, by_epoch)
        self.import_neptune()
        self.init_kwargs = init_kwargs
        self.with_step = with_step

    def import_neptune(self):
        """Import the neptune client lazily and keep it on the instance."""
        try:
            import neptune.new as neptune
        except ImportError:
            raise ImportError(
                'Please run "pip install neptune-client" to install neptune')
        self.neptune = neptune
        self.run = None

    @master_only
    def before_run(self, runner):
        if self.init_kwargs:
            self.run = self.neptune.init(**self.init_kwargs)
        else:
            self.run = self.neptune.init()

    @master_only
    def log(self, runner):
        tags = self.get_loggable_tags(runner)
        if not tags:
            return
        step = self.get_iter(runner)
        if not self.with_step:
            # Add the step before iterating: inserting a key while looping
            # over `tags.items()` raises "dictionary changed size during
            # iteration".
            tags['global_step'] = step
        for tag_name, tag_value in tags.items():
            if self.with_step:
                self.run[tag_name].log(tag_value, step=step)
            else:
                # Log the single value of this field, not the whole dict.
                self.run[tag_name].log(tag_value)

    @master_only
    def after_run(self, runner):
        self.run.stop()
@HOOKS.register_module()
class PaviLoggerHook(LoggerHook):
    """Logger hook that writes scalars to Pavi through ``pavi.SummaryWriter``.

    Args:
        init_kwargs (dict, optional): Keyword arguments for
            ``SummaryWriter``. ``name``, ``model`` and, when a config is
            found in ``runner.meta``, ``session_text`` are filled in by
            ``before_run``.
        add_graph (bool): Whether to add the model graph before the first
            epoch. Default: False.
        add_last_ckpt (bool): Whether to upload ``latest.pth`` from
            ``runner.work_dir`` in ``after_run``. Default: False.
        interval (int): Logging interval (every k iterations).
        ignore_last (bool): Ignore the log of last iterations in each epoch
            if less than `interval`.
        reset_flag (bool): Whether to clear the output buffer after logging.
        by_epoch (bool): Whether EpochBasedRunner is used.
        img_key (str): Key of the batch entry fed to ``add_graph``.
            Default: 'img_info'.
    """

    def __init__(self,
                 init_kwargs=None,
                 add_graph=False,
                 add_last_ckpt=False,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 by_epoch=True,
                 img_key='img_info'):
        super(PaviLoggerHook, self).__init__(interval, ignore_last, reset_flag,
                                             by_epoch)
        self.init_kwargs = init_kwargs
        self.add_graph = add_graph
        self.add_last_ckpt = add_last_ckpt
        self.img_key = img_key

    @master_only
    def before_run(self, runner):
        super(PaviLoggerHook, self).before_run(runner)
        try:
            from pavi import SummaryWriter
        except ImportError:
            raise ImportError('Please run "pip install pavi" to install pavi.')

        self.run_name = runner.work_dir.split('/')[-1]

        if not self.init_kwargs:
            self.init_kwargs = dict()
        self.init_kwargs['name'] = self.run_name
        self.init_kwargs['model'] = runner._model_name
        if runner.meta is not None:
            if 'config_dict' in runner.meta:
                config_dict = runner.meta['config_dict']
                assert isinstance(
                    config_dict,
                    dict), ('meta["config_dict"] has to be of a dict, '
                            f'but got {type(config_dict)}')
            elif 'config_file' in runner.meta:
                config_file = runner.meta['config_file']
                config_dict = dict(mmcv.Config.fromfile(config_file))
            else:
                config_dict = None
            if config_dict is not None:
                # 'max_.*iter' is parsed in pavi sdk as the maximum iterations
                # to properly set up the progress bar.
                config_dict = config_dict.copy()
                config_dict.setdefault('max_iter', runner.max_iters)
                # non-serializable values are first converted in
                # mmcv.dump to json
                config_dict = json.loads(
                    mmcv.dump(config_dict, file_format='json'))
                session_text = yaml.dump(config_dict)
                self.init_kwargs['session_text'] = session_text
        self.writer = SummaryWriter(**self.init_kwargs)

    def get_step(self, runner):
        """Get the total training step/epoch."""
        if self.get_mode(runner) == 'val' and self.by_epoch:
            return self.get_epoch(runner)
        else:
            return self.get_iter(runner)

    @master_only
    def log(self, runner):
        tags = self.get_loggable_tags(runner, add_mode=False)
        if tags:
            self.writer.add_scalars(
                self.get_mode(runner), tags, self.get_step(runner))

    @master_only
    def after_run(self, runner):
        if self.add_last_ckpt:
            ckpt_path = osp.join(runner.work_dir, 'latest.pth')
            if osp.islink(ckpt_path):
                ckpt_path = osp.join(runner.work_dir, os.readlink(ckpt_path))

            if osp.isfile(ckpt_path):
                # runner.epoch += 1 has been done before `after_run`.
                iteration = runner.epoch if self.by_epoch else runner.iter
                # No early return here: the writer must still be closed
                # below after the snapshot has been added.
                self.writer.add_snapshot_file(
                    tag=self.run_name,
                    snapshot_file_path=ckpt_path,
                    iteration=iteration)

        # flush the buffer and send a task ending signal to Pavi
        self.writer.close()

    @master_only
    def before_epoch(self, runner):
        if runner.epoch == 0 and self.add_graph:
            if is_module_wrapper(runner.model):
                _model = runner.model.module
            else:
                _model = runner.model
            device = next(_model.parameters()).device
            data = next(iter(runner.data_loader))
            # Trace the graph with the first sample of the batch only.
            image = data[self.img_key][0:1].to(device)
            with torch.no_grad():
                self.writer.add_graph(_model, image)
@HOOKS.register_module()
class TensorboardLoggerHook(LoggerHook):
    """Logger hook that writes scalars and text to TensorBoard.

    Args:
        log_dir (str, optional): Directory of the event files. When None,
            ``<runner.work_dir>/tf_logs`` is used. Default: None.
        interval (int): Logging interval (every k iterations).
        ignore_last (bool): Ignore the log of last iterations in each epoch
            if less than `interval`.
        reset_flag (bool): Whether to clear the output buffer after logging.
        by_epoch (bool): Whether EpochBasedRunner is used.
    """

    def __init__(self,
                 log_dir=None,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 by_epoch=True):
        super().__init__(interval, ignore_last, reset_flag, by_epoch)
        self.log_dir = log_dir

    @master_only
    def before_run(self, runner):
        super().before_run(runner)

        # parrots and PyTorch older than 1.1 use tensorboardX; everything
        # else uses the writer bundled with PyTorch.
        needs_tensorboardx = (
            TORCH_VERSION == 'parrots'
            or digit_version(TORCH_VERSION) < digit_version('1.1'))
        if needs_tensorboardx:
            try:
                from tensorboardX import SummaryWriter
            except ImportError:
                raise ImportError('Please install tensorboardX to use '
                                  'TensorboardLoggerHook.')
        else:
            try:
                from torch.utils.tensorboard import SummaryWriter
            except ImportError:
                raise ImportError(
                    'Please run "pip install future tensorboard" to install '
                    'the dependencies to use torch.utils.tensorboard '
                    '(applicable to PyTorch 1.1 or higher)')

        if self.log_dir is None:
            self.log_dir = osp.join(runner.work_dir, 'tf_logs')
        self.writer = SummaryWriter(self.log_dir)

    @master_only
    def log(self, runner):
        loggable = self.get_loggable_tags(runner, allow_text=True)
        for tag, val in loggable.items():
            # Strings go to the text panel, everything else is a scalar.
            write = (self.writer.add_text
                     if isinstance(val, str) else self.writer.add_scalar)
            write(tag, val, self.get_iter(runner))

    @master_only
    def after_run(self, runner):
        self.writer.close()
All rights reserved. +import datetime +import os +import os.path as osp +from collections import OrderedDict + +import torch +import torch.distributed as dist + +import annotator.uniformer.mmcv as mmcv +from annotator.uniformer.mmcv.fileio.file_client import FileClient +from annotator.uniformer.mmcv.utils import is_tuple_of, scandir +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class TextLoggerHook(LoggerHook): + """Logger hook in text. + + In this logger hook, the information will be printed on terminal and + saved in json file. + + Args: + by_epoch (bool, optional): Whether EpochBasedRunner is used. + Default: True. + interval (int, optional): Logging interval (every k iterations). + Default: 10. + ignore_last (bool, optional): Ignore the log of last iterations in each + epoch if less than :attr:`interval`. Default: True. + reset_flag (bool, optional): Whether to clear the output buffer after + logging. Default: False. + interval_exp_name (int, optional): Logging interval for experiment + name. This feature is to help users conveniently get the experiment + information from screen or log file. Default: 1000. + out_dir (str, optional): Logs are saved in ``runner.work_dir`` default. + If ``out_dir`` is specified, logs will be copied to a new directory + which is the concatenation of ``out_dir`` and the last level + directory of ``runner.work_dir``. Default: None. + `New in version 1.3.16.` + out_suffix (str or tuple[str], optional): Those filenames ending with + ``out_suffix`` will be copied to ``out_dir``. + Default: ('.log.json', '.log', '.py'). + `New in version 1.3.16.` + keep_local (bool, optional): Whether to keep local log when + :attr:`out_dir` is specified. If False, the local log will be + removed. Default: True. + `New in version 1.3.16.` + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. 
+ `New in version 1.3.16.` + """ + + def __init__(self, + by_epoch=True, + interval=10, + ignore_last=True, + reset_flag=False, + interval_exp_name=1000, + out_dir=None, + out_suffix=('.log.json', '.log', '.py'), + keep_local=True, + file_client_args=None): + super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, + by_epoch) + self.by_epoch = by_epoch + self.time_sec_tot = 0 + self.interval_exp_name = interval_exp_name + + if out_dir is None and file_client_args is not None: + raise ValueError( + 'file_client_args should be "None" when `out_dir` is not' + 'specified.') + self.out_dir = out_dir + + if not (out_dir is None or isinstance(out_dir, str) + or is_tuple_of(out_dir, str)): + raise TypeError('out_dir should be "None" or string or tuple of ' + 'string, but got {out_dir}') + self.out_suffix = out_suffix + + self.keep_local = keep_local + self.file_client_args = file_client_args + if self.out_dir is not None: + self.file_client = FileClient.infer_client(file_client_args, + self.out_dir) + + def before_run(self, runner): + super(TextLoggerHook, self).before_run(runner) + + if self.out_dir is not None: + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + # The final `self.out_dir` is the concatenation of `self.out_dir` + # and the last level directory of `runner.work_dir` + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + (f'Text logs will be saved to {self.out_dir} by ' + f'{self.file_client.name} after the training process.')) + + self.start_iter = runner.iter + self.json_log_path = osp.join(runner.work_dir, + f'{runner.timestamp}.log.json') + if runner.meta is not None: + self._dump_log(runner.meta, runner) + + def _get_max_memory(self, runner): + device = getattr(runner.model, 'output_device', None) + mem = torch.cuda.max_memory_allocated(device=device) + mem_mb = torch.tensor([mem / (1024 * 1024)], + 
dtype=torch.int, + device=device) + if runner.world_size > 1: + dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) + return mem_mb.item() + + def _log_info(self, log_dict, runner): + # print exp name for users to distinguish experiments + # at every ``interval_exp_name`` iterations and the end of each epoch + if runner.meta is not None and 'exp_name' in runner.meta: + if (self.every_n_iters(runner, self.interval_exp_name)) or ( + self.by_epoch and self.end_of_epoch(runner)): + exp_info = f'Exp name: {runner.meta["exp_name"]}' + runner.logger.info(exp_info) + + if log_dict['mode'] == 'train': + if isinstance(log_dict['lr'], dict): + lr_str = [] + for k, val in log_dict['lr'].items(): + lr_str.append(f'lr_{k}: {val:.3e}') + lr_str = ' '.join(lr_str) + else: + lr_str = f'lr: {log_dict["lr"]:.3e}' + + # by epoch: Epoch [4][100/1000] + # by iter: Iter [100/100000] + if self.by_epoch: + log_str = f'Epoch [{log_dict["epoch"]}]' \ + f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t' + else: + log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t' + log_str += f'{lr_str}, ' + + if 'time' in log_dict.keys(): + self.time_sec_tot += (log_dict['time'] * self.interval) + time_sec_avg = self.time_sec_tot / ( + runner.iter - self.start_iter + 1) + eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) + eta_str = str(datetime.timedelta(seconds=int(eta_sec))) + log_str += f'eta: {eta_str}, ' + log_str += f'time: {log_dict["time"]:.3f}, ' \ + f'data_time: {log_dict["data_time"]:.3f}, ' + # statistic memory + if torch.cuda.is_available(): + log_str += f'memory: {log_dict["memory"]}, ' + else: + # val/test time + # here 1000 is the length of the val dataloader + # by epoch: Epoch[val] [4][1000] + # by iter: Iter[val] [1000] + if self.by_epoch: + log_str = f'Epoch({log_dict["mode"]}) ' \ + f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t' + else: + log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' + + log_items = [] + for name, val in log_dict.items(): + # TODO: resolve 
this hack + # these items have been in log_str + if name in [ + 'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time', + 'memory', 'epoch' + ]: + continue + if isinstance(val, float): + val = f'{val:.4f}' + log_items.append(f'{name}: {val}') + log_str += ', '.join(log_items) + + runner.logger.info(log_str) + + def _dump_log(self, log_dict, runner): + # dump log in json format + json_log = OrderedDict() + for k, v in log_dict.items(): + json_log[k] = self._round_float(v) + # only append log at last line + if runner.rank == 0: + with open(self.json_log_path, 'a+') as f: + mmcv.dump(json_log, f, file_format='json') + f.write('\n') + + def _round_float(self, items): + if isinstance(items, list): + return [self._round_float(item) for item in items] + elif isinstance(items, float): + return round(items, 5) + else: + return items + + def log(self, runner): + if 'eval_iter_num' in runner.log_buffer.output: + # this doesn't modify runner.iter and is regardless of by_epoch + cur_iter = runner.log_buffer.output.pop('eval_iter_num') + else: + cur_iter = self.get_iter(runner, inner_iter=True) + + log_dict = OrderedDict( + mode=self.get_mode(runner), + epoch=self.get_epoch(runner), + iter=cur_iter) + + # only record lr of the first param group + cur_lr = runner.current_lr() + if isinstance(cur_lr, list): + log_dict['lr'] = cur_lr[0] + else: + assert isinstance(cur_lr, dict) + log_dict['lr'] = {} + for k, lr_ in cur_lr.items(): + assert isinstance(lr_, list) + log_dict['lr'].update({k: lr_[0]}) + + if 'time' in runner.log_buffer.output: + # statistic memory + if torch.cuda.is_available(): + log_dict['memory'] = self._get_max_memory(runner) + + log_dict = dict(log_dict, **runner.log_buffer.output) + + self._log_info(log_dict, runner) + self._dump_log(log_dict, runner) + return log_dict + + def after_run(self, runner): + # copy or upload logs to self.out_dir + if self.out_dir is not None: + for filename in scandir(runner.work_dir, self.out_suffix, True): + local_filepath = 
@HOOKS.register_module()
class WandbLoggerHook(LoggerHook):
    """Logger hook that forwards the runner's loggable tags to ``wandb``.

    Args:
        init_kwargs (dict, optional): Keyword arguments passed to
            ``wandb.init`` in ``before_run``. When empty or None,
            ``wandb.init()`` is called without arguments.
        interval, ignore_last, reset_flag, by_epoch: Passed positionally, in
            that order, to ``LoggerHook.__init__``.
        commit (bool): Forwarded as ``commit`` to every ``wandb.log`` call.
        with_step (bool): If True the current iteration is passed to
            ``wandb.log`` as ``step``; otherwise it is stored in the tags
            under ``'global_step'``.
    """

    def __init__(self,
                 init_kwargs=None,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 commit=True,
                 by_epoch=True,
                 with_step=True):
        super(WandbLoggerHook, self).__init__(interval, ignore_last,
                                              reset_flag, by_epoch)
        # Fail early (at construction) when wandb is not installed.
        self.import_wandb()
        self.init_kwargs = init_kwargs
        self.commit = commit
        self.with_step = with_step

    def import_wandb(self):
        """Import ``wandb`` lazily and keep the module on ``self.wandb``."""
        try:
            import wandb
        except ImportError:
            raise ImportError(
                'Please run "pip install wandb" to install wandb')
        self.wandb = wandb

    @master_only
    def before_run(self, runner):
        """Start the wandb run, passing ``init_kwargs`` when given."""
        super(WandbLoggerHook, self).before_run(runner)
        if self.wandb is None:
            self.import_wandb()
        # An empty/None ``init_kwargs`` expands to a plain ``wandb.init()``.
        self.wandb.init(**(self.init_kwargs or {}))

    @master_only
    def log(self, runner):
        """Send the current loggable tags to wandb; do nothing if empty."""
        tags = self.get_loggable_tags(runner)
        if not tags:
            return
        step = self.get_iter(runner)
        if self.with_step:
            self.wandb.log(tags, step=step, commit=self.commit)
        else:
            tags['global_step'] = step
            self.wandb.log(tags, commit=self.commit)

    @master_only
    def after_run(self, runner):
        """Close the wandb run."""
        self.wandb.join()
def get_warmup_lr(self, cur_iters):
    """Return the warmup learning rates for iteration ``cur_iters``.

    The regular learning rates in ``self.regular_lr`` are multiplied by a
    factor selected by ``self.warmup``:

    * ``'constant'``: ``warmup_ratio``
    * ``'linear'``: ``1 - (1 - cur_iters / warmup_iters) * (1 - warmup_ratio)``
    * ``'exp'``: ``warmup_ratio ** (1 - cur_iters / warmup_iters)``

    Args:
        cur_iters (int): Current iteration inside the warmup period.

    Returns:
        list | dict: Scaled learning rates, in the same layout as
        ``self.regular_lr`` (a list, or a dict of lists keyed like it).
    """

    def _scale(lrs):
        # Pick the multiplier once, then apply it to every param group.
        if self.warmup == 'constant':
            factor = self.warmup_ratio
        elif self.warmup == 'linear':
            k = (1 - cur_iters / self.warmup_iters) * (1 -
                                                       self.warmup_ratio)
            factor = 1 - k
        elif self.warmup == 'exp':
            factor = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
        return [lr * factor for lr in lrs]

    regular = self.regular_lr
    if isinstance(regular, dict):
        # multiple optimizers: scale each optimizer's groups separately
        return {key: _scale(lrs) for key, lrs in regular.items()}
    return _scale(regular)
def before_train_iter(self, runner):
    """Set the learning rate for the coming training iteration.

    Iteration-based schedules (``by_epoch`` false) recompute the regular LR
    on every call and apply either it or, while ``runner.iter`` is below
    ``warmup_iters``, the warmup LR. Epoch-based schedules only touch the
    optimizer during warmup: the warmup LR is applied before
    ``warmup_iters``, the stored regular LR exactly at ``warmup_iters``, and
    nothing afterwards.

    Args:
        runner: Object providing the current iteration as ``runner.iter``.
    """
    step = runner.iter

    if self.by_epoch:
        # Past warmup (or no warmup at all): the per-epoch LR stays in force.
        if self.warmup is None or step > self.warmup_iters:
            return
        if step == self.warmup_iters:
            # warmup just ended: restore the regular LR
            lr_groups = self.regular_lr
        else:
            lr_groups = self.get_warmup_lr(step)
        self._set_lr(runner, lr_groups)
        return

    self.regular_lr = self.get_regular_lr(runner)
    in_warmup = self.warmup is not None and step < self.warmup_iters
    if in_warmup:
        self._set_lr(runner, self.get_warmup_lr(step))
    else:
        self._set_lr(runner, self.regular_lr)
def get_lr(self, runner, base_lr):
    """Return ``base_lr`` decayed by ``gamma`` once per step boundary passed.

    Progress is ``runner.epoch`` when ``self.by_epoch`` is set, otherwise
    ``runner.iter``. With an integer ``self.step`` the decay exponent is
    ``progress // step``; with a list it is the index of the first entry
    greater than the progress (or the list length when there is none).

    Args:
        runner: Object providing ``epoch`` / ``iter``.
        base_lr (float): Initial learning rate of one param group.

    Returns:
        float: The decayed rate, raised to ``self.min_lr`` when that is set
        and larger.
    """
    progress = runner.iter if not self.by_epoch else runner.epoch

    # number of decays applied so far
    if isinstance(self.step, int):
        exp = progress // self.step
    else:
        exp = next(
            (idx for idx, bound in enumerate(self.step) if progress < bound),
            len(self.step))

    lr = base_lr * (self.gamma**exp)
    # clip to the minimum value only when one was configured
    return lr if self.min_lr is None else max(lr, self.min_lr)
@HOOKS.register_module()
class CosineAnnealingLrUpdaterHook(LrUpdaterHook):
    """Cosine-anneal the learning rate from the base LR towards a minimum.

    Exactly one of ``min_lr`` and ``min_lr_ratio`` must be given.

    Args:
        min_lr (float, optional): Target learning rate.
        min_lr_ratio (float, optional): Target expressed as a multiple of
            each param group's base LR.
        **kwargs: Forwarded to ``LrUpdaterHook.__init__``.
    """

    def __init__(self, min_lr=None, min_lr_ratio=None, **kwargs):
        # exactly one of the two ways of specifying the minimum is allowed
        assert (min_lr is None) != (min_lr_ratio is None)
        self.min_lr = min_lr
        self.min_lr_ratio = min_lr_ratio
        super(CosineAnnealingLrUpdaterHook, self).__init__(**kwargs)

    def get_lr(self, runner, base_lr):
        """Return the annealed LR for the runner's current epoch/iteration."""
        if self.by_epoch:
            done, total = runner.epoch, runner.max_epochs
        else:
            done, total = runner.iter, runner.max_iters

        if self.min_lr_ratio is None:
            target_lr = self.min_lr
        else:
            target_lr = base_lr * self.min_lr_ratio
        return annealing_cos(base_lr, target_lr, done / total)
def get_lr(self, runner, base_lr):
    """Return the flat-then-cosine learning rate for the current progress.

    The annealing starts at ``round(total * self.start_percent)`` where
    ``total`` is ``runner.max_epochs`` or ``runner.max_iters`` depending on
    ``self.by_epoch``. Before that point ``base_lr`` is returned unchanged;
    from it onwards the rate is cosine-annealed towards ``self.min_lr`` or
    ``base_lr * self.min_lr_ratio``.

    Args:
        runner: Object providing ``epoch``/``max_epochs`` or
            ``iter``/``max_iters``.
        base_lr (float): Initial learning rate of one param group.

    Returns:
        float: Learning rate to use.
    """
    if self.by_epoch:
        total, current = runner.max_epochs, runner.epoch
    else:
        total, current = runner.max_iters, runner.iter

    start = round(total * self.start_percent)
    progress = current - start
    max_progress = total - start

    if self.min_lr_ratio is None:
        target_lr = self.min_lr
    else:
        target_lr = base_lr * self.min_lr_ratio

    # still inside the flat part of the schedule
    if progress < 0:
        return base_lr
    return annealing_cos(base_lr, target_lr, progress / max_progress)
def get_position_from_periods(iteration, cumulative_periods):
    """Locate the period that contains ``iteration``.

    Returns the index of the first entry of ``cumulative_periods`` that is
    strictly greater than ``iteration``. For example, with
    ``cumulative_periods = [100, 200, 300, 400]``:
    ``iteration == 50`` gives 0, ``iteration == 210`` gives 2 and
    ``iteration == 300`` gives 3.

    Args:
        iteration (int): Current iteration.
        cumulative_periods (list[int]): Cumulative period list.

    Returns:
        int: Index of the matching period.

    Raises:
        ValueError: If ``iteration`` is not below any entry.
    """
    position = next(
        (idx for idx, bound in enumerate(cumulative_periods)
         if iteration < bound),
        None)
    if position is None:
        raise ValueError(f'Current iteration {iteration} exceeds '
                         f'cumulative_periods {cumulative_periods}')
    return position
+ target_ratio (tuple[float]): Relative ratio of the highest LR and the + lowest LR to the initial LR. + cyclic_times (int): Number of cycles during training + step_ratio_up (float): The ratio of the increasing process of LR in + the total cycle. + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: 'cos' for cosine annealing, + 'linear' for linear annealing. Default: 'cos'. + """ + + def __init__(self, + by_epoch=False, + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, + anneal_strategy='cos', + **kwargs): + if isinstance(target_ratio, float): + target_ratio = (target_ratio, target_ratio / 1e5) + elif isinstance(target_ratio, tuple): + target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ + if len(target_ratio) == 1 else target_ratio + else: + raise ValueError('target_ratio should be either float ' + f'or tuple, got {type(target_ratio)}') + + assert len(target_ratio) == 2, \ + '"target_ratio" must be list or tuple of two floats' + assert 0 <= step_ratio_up < 1.0, \ + '"step_ratio_up" must be in range [0,1)' + + self.target_ratio = target_ratio + self.cyclic_times = cyclic_times + self.step_ratio_up = step_ratio_up + self.lr_phases = [] # init lr_phases + # validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError('anneal_strategy must be one of "cos" or ' + f'"linear", instead got {anneal_strategy}') + elif anneal_strategy == 'cos': + self.anneal_func = annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = annealing_linear + + assert not by_epoch, \ + 'currently only support "by_epoch" = False' + super(CyclicLrUpdaterHook, self).__init__(by_epoch, **kwargs) + + def before_run(self, runner): + super(CyclicLrUpdaterHook, self).before_run(runner) + # initiate lr_phases + # total lr_phases are separated as up and down + max_iter_per_phase = runner.max_iters // self.cyclic_times + iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) + self.lr_phases.append( + [0, 
def get_lr(self, runner, base_lr):
    """Return the cyclic learning rate for ``runner.iter``.

    Each entry of ``self.lr_phases`` is
    ``[start, end, iters_per_cycle, start_ratio, end_ratio]``. The iteration
    is wrapped into one cycle and, for the phase whose ``[start, end)``
    range contains it, ``self.anneal_func`` interpolates between
    ``base_lr * start_ratio`` and ``base_lr * end_ratio``.

    Args:
        runner: Object providing the current iteration as ``runner.iter``.
        base_lr (float): Initial learning rate of one param group.

    Returns:
        The value produced by ``self.anneal_func``, or None when no phase
        contains the wrapped iteration.
    """
    step = runner.iter
    for lo, hi, period, ratio_from, ratio_to in self.lr_phases:
        # position inside the current cycle
        step %= period
        if not (lo <= step < hi):
            continue
        offset = step - lo
        return self.anneal_func(base_lr * ratio_from, base_lr * ratio_to,
                                offset / (hi - lo))
    return None
def __init__(self,
             max_lr,
             total_steps=None,
             pct_start=0.3,
             anneal_strategy='cos',
             div_factor=25,
             final_div_factor=1e4,
             three_phase=False,
             **kwargs):
    """Validate and store the one-cycle schedule settings.

    Args:
        max_lr (number | list | tuple | dict): Peak learning rate(s).
        total_steps (int, optional): Length of the cycle. The attribute
            ``total_steps`` is only set when a value is given.
        pct_start (float): Fraction of the cycle spent increasing the LR;
            must be a float in ``[0, 1]``.
        anneal_strategy (str): ``'cos'`` or ``'linear'``.
        div_factor (float): Stored as ``self.div_factor``.
        final_div_factor (float): Stored as ``self.final_div_factor``.
        three_phase (bool): Stored as ``self.three_phase``.
        **kwargs: Forwarded to the parent ``__init__``; ``by_epoch`` is
            forced to False when absent and must be falsy when present.

    Raises:
        ValueError: If ``max_lr``, ``total_steps``, ``pct_start`` or
            ``anneal_strategy`` is invalid.
    """
    # validate by_epoch, currently only support by_epoch = False
    if 'by_epoch' not in kwargs:
        kwargs['by_epoch'] = False
    else:
        assert not kwargs['by_epoch'], \
            'currently only support "by_epoch" = False'

    # Tuples are accepted too, matching what ``format_param`` handles.
    if not isinstance(max_lr, (numbers.Number, list, tuple, dict)):
        raise ValueError('the type of max_lr must be one of number, list, '
                         f'tuple or dict, but got {type(max_lr)}')
    self._max_lr = max_lr

    if total_steps is not None:
        if not isinstance(total_steps, int):
            raise ValueError('the type of total_steps must be int, but '
                             f'got {type(total_steps)}')
        # only defined when given; ``before_run`` checks with ``hasattr``
        self.total_steps = total_steps

    # validate pct_start; the type is checked first so that a non-numeric
    # value yields this ValueError instead of a TypeError from comparing
    if not isinstance(pct_start, float) or pct_start < 0 or pct_start > 1:
        raise ValueError('expected float between 0 and 1 pct_start, but '
                         f'got {pct_start}')
    self.pct_start = pct_start

    # validate anneal_strategy
    if anneal_strategy not in ['cos', 'linear']:
        raise ValueError('anneal_strategy must be one of "cos" or '
                         f'"linear", instead got {anneal_strategy}')
    elif anneal_strategy == 'cos':
        self.anneal_func = annealing_cos
    elif anneal_strategy == 'linear':
        self.anneal_func = annealing_linear

    self.div_factor = div_factor
    self.final_div_factor = final_div_factor
    self.three_phase = three_phase
    self.lr_phases = []  # filled in by before_run
    super(OneCycleLrUpdaterHook, self).__init__(**kwargs)
def get_lr(self, runner, base_lr):
    """Return the one-cycle learning rate for ``runner.iter``.

    ``self.lr_phases`` holds ``[end_iter, start_ratio, end_ratio]`` entries
    in order; a phase starts where the previous one ended (the first starts
    at 0). Inside the first phase whose ``end_iter`` has not been passed,
    ``self.anneal_func`` interpolates between ``base_lr * start_ratio`` and
    ``base_lr * end_ratio``.

    A phase of zero length (for example when ``pct_start * total_steps``
    equals 1, which puts the first boundary at iteration 0) is treated as
    already finished, so its end value is returned instead of dividing by
    zero.

    Args:
        runner: Object providing the current iteration as ``runner.iter``.
        base_lr (float): Initial learning rate of one param group.

    Returns:
        The value produced by ``self.anneal_func``.

    Raises:
        ValueError: If ``runner.iter`` lies beyond the last phase (or no
            phases have been set up yet).
    """
    curr_iter = runner.iter
    start_iter = 0
    for end_iter, start_lr, end_lr in self.lr_phases:
        if curr_iter <= end_iter:
            span = end_iter - start_iter
            # zero-length phase: we are exactly at its end point
            pct = (curr_iter - start_iter) / span if span else 1.0
            return self.anneal_func(base_lr * start_lr, base_lr * end_lr,
                                    pct)
        start_iter = end_iter
    raise ValueError(
        f'Current iteration {curr_iter} is not covered by any of the '
        f'{len(self.lr_phases)} configured lr phases')
def annealing_linear(start, end, factor):
    """Interpolate linearly between ``start`` and ``end``.

    Args:
        start (float): Value returned for ``factor == 0.0``.
        end (float): Value returned for ``factor == 1.0``.
        factor (float): Interpolation position, from 0.0 to 1.0.

    Returns:
        float: ``start + (end - start) * factor``.
    """
    delta = end - start
    return start + delta * factor


def format_param(name, optim, param):
    """Expand ``param`` to one value per param group of ``optim``.

    Args:
        name (str): Key used to look ``param`` up when it is a mapping, and
            used in error messages.
        optim: Object exposing ``param_groups``.
        param (number | list | tuple | dict): A single number (repeated for
            every group), a sequence with one value per group (returned as
            is), or a mapping from which ``param[name]`` is returned.

    Returns:
        list | tuple: The per-group values.

    Raises:
        ValueError: If a sequence has a different length than
            ``optim.param_groups``.
        KeyError: If ``name`` is missing from a mapping.
    """
    if isinstance(param, numbers.Number):
        # one scalar shared by all groups
        return len(optim.param_groups) * [param]

    if isinstance(param, (list, tuple)):  # multi param groups
        if len(param) == len(optim.param_groups):
            return param
        raise ValueError(f'expected {len(optim.param_groups)} '
                         f'values for {name}, got {len(param)}')

    # multi optimizers
    if name in param:
        return param[name]
    raise KeyError(f'{name} is not found in {param.keys()}')
@HOOKS.register_module()
class EmptyCacheHook(Hook):
    """Hook that calls ``torch.cuda.empty_cache()`` at selected points.

    Args:
        before_epoch (bool): Empty the cache in ``before_epoch``.
        after_epoch (bool): Empty the cache in ``after_epoch``.
        after_iter (bool): Empty the cache in ``after_iter``.
    """

    def __init__(self, before_epoch=False, after_epoch=True, after_iter=False):
        self._after_iter = after_iter
        self._after_epoch = after_epoch
        self._before_epoch = before_epoch

    def after_iter(self, runner):
        """Empty the CUDA cache after an iteration, if enabled."""
        if not self._after_iter:
            return
        torch.cuda.empty_cache()

    def before_epoch(self, runner):
        """Empty the CUDA cache before an epoch, if enabled."""
        if not self._before_epoch:
            return
        torch.cuda.empty_cache()

    def after_epoch(self, runner):
        """Empty the CUDA cache after an epoch, if enabled."""
        if not self._after_epoch:
            return
        torch.cuda.empty_cache()
def get_warmup_momentum(self, cur_iters):
    """Return the warmup momentum values for iteration ``cur_iters``.

    The regular momentum values are divided by a factor selected by
    ``self.warmup``:

    * ``'constant'``: ``warmup_ratio``
    * ``'linear'``: ``1 - (1 - cur_iters / warmup_iters) * (1 - warmup_ratio)``
    * ``'exp'``: ``warmup_ratio ** (1 - cur_iters / warmup_iters)``

    The regular values are read from ``self.regular_mom`` (the attribute
    the ``before_train_epoch`` / ``before_train_iter`` methods of this hook
    assign) and fall back to ``self.regular_momentum`` when that attribute
    is missing. Both the single-optimizer (list) and the multi-optimizer
    (dict of lists) layouts are supported.

    Args:
        cur_iters (int): Current iteration inside the warmup period.

    Returns:
        list | dict: Warmup momentum values in the same layout as the
        regular momentum values.

    Raises:
        ValueError: If ``self.warmup`` is not a supported warmup type.
    """

    def _get_warmup_momentum(cur_iters, regular_momentum):
        if self.warmup == 'constant':
            scale = self.warmup_ratio
        elif self.warmup == 'linear':
            k = (1 - cur_iters / self.warmup_iters) * (1 -
                                                       self.warmup_ratio)
            scale = 1 - k
        elif self.warmup == 'exp':
            scale = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
        else:
            raise ValueError(
                f'"{self.warmup}" is not a supported type for warming up')
        # Use the values handed in, not an attribute: for multiple
        # optimizers each call receives one optimizer's list.
        return [_momentum / scale for _momentum in regular_momentum]

    # ``regular_momentum`` is only initialised to an empty list in
    # ``__init__``; the values computed during training live in
    # ``regular_mom``.
    regular = getattr(self, 'regular_mom', None)
    if regular is None:
        regular = self.regular_momentum

    if isinstance(regular, dict):
        return {
            key: _get_warmup_momentum(cur_iters, group)
            for key, group in regular.items()
        }
    return _get_warmup_momentum(cur_iters, regular)
def get_momentum(self, runner, base_momentum):
    """Return ``base_momentum`` decayed by ``gamma`` per step boundary passed.

    Progress is ``runner.epoch`` when ``self.by_epoch`` is set, otherwise
    ``runner.iter``. With an integer ``self.step`` the decay exponent is
    ``progress // step``; with a list it is the index of the first entry
    greater than the progress (or the list length when there is none).

    Args:
        runner: Object providing ``epoch`` / ``iter``.
        base_momentum (float): Initial momentum of one param group.

    Returns:
        float: The decayed momentum, raised to ``self.min_momentum`` when
        that is set and larger.
    """
    progress = runner.iter if not self.by_epoch else runner.epoch

    # number of decays applied so far
    if isinstance(self.step, int):
        exp = progress // self.step
    else:
        exp = next(
            (idx for idx, bound in enumerate(self.step) if progress < bound),
            len(self.step))

    momentum = base_momentum * (self.gamma**exp)
    if self.min_momentum is None:
        return momentum
    # clip to the configured minimum value
    return max(momentum, self.min_momentum)
@HOOKS.register_module()
class CyclicMomentumUpdaterHook(MomentumUpdaterHook):
    """Cyclic momentum Scheduler.

    Implement the cyclical momentum scheduler policy described in
    https://arxiv.org/pdf/1708.07120.pdf

    This momentum scheduler is usually used together with the CyclicLRUpdater
    to improve the performance in the 3D detection area.

    Every cycle consists of two phases annealed with ``annealing_cos``:
    momentum first moves from ``base_momentum`` to
    ``base_momentum * target_ratio[0]`` and then on to
    ``base_momentum * target_ratio[1]``.

    Args:
        by_epoch (bool): Whether to update momentum by epoch. Only ``False``
            is currently supported.
        target_ratio (float | tuple[float] | list[float]): Relative ratio of
            the lowest momentum and the highest momentum to the initial
            momentum. A single value ``r`` (a float or a one-element
            sequence) is expanded to ``(r, r / 1e5)``.
        cyclic_times (int): Number of cycles during training.
        step_ratio_up (float): The ratio of the increasing process of
            momentum in the total cycle. Must lie in ``[0, 1)``.
    """

    def __init__(self,
                 by_epoch=False,
                 target_ratio=(0.85 / 0.95, 1),
                 cyclic_times=1,
                 step_ratio_up=0.4,
                 **kwargs):
        if isinstance(target_ratio, float):
            target_ratio = (target_ratio, target_ratio / 1e5)
        elif isinstance(target_ratio, (tuple, list)):
            # Lists are accepted too (the assert message below already
            # promises "list or tuple"); normalise to a tuple.
            target_ratio = tuple(target_ratio)
            if len(target_ratio) == 1:
                target_ratio = (target_ratio[0], target_ratio[0] / 1e5)
        else:
            raise ValueError('target_ratio should be either float '
                             f'or tuple, got {type(target_ratio)}')

        assert len(target_ratio) == 2, \
            '"target_ratio" must be list or tuple of two floats'
        assert 0 <= step_ratio_up < 1.0, \
            '"step_ratio_up" must be in range [0,1)'

        self.target_ratio = target_ratio
        self.cyclic_times = cyclic_times
        self.step_ratio_up = step_ratio_up
        self.momentum_phases = []  # init momentum_phases
        # currently only support by_epoch=False
        assert not by_epoch, \
            'currently only support "by_epoch" = False'
        super(CyclicMomentumUpdaterHook, self).__init__(by_epoch, **kwargs)

    def before_run(self, runner):
        """Build the up/down phases of one cycle from ``runner.max_iters``."""
        super(CyclicMomentumUpdaterHook, self).before_run(runner)
        # total momentum_phases are separated as up and down
        max_iter_per_phase = runner.max_iters // self.cyclic_times
        iter_up_phase = int(self.step_ratio_up * max_iter_per_phase)
        # Rebuild instead of appending so a hook that is run more than once
        # does not accumulate duplicate phases.
        # Each entry: [start_iter, end_iter, cycle_len, start_ratio, end_ratio]
        self.momentum_phases = [
            [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]],
            [
                iter_up_phase, max_iter_per_phase, max_iter_per_phase,
                self.target_ratio[0], self.target_ratio[1]
            ],
        ]

    def get_momentum(self, runner, base_momentum):
        """Return the momentum for ``runner.iter`` within its current cycle.

        Returns ``None`` implicitly when no phase contains the current
        position (e.g. before ``before_run`` has built the phases).
        """
        for (start_iter, end_iter, max_iter_per_phase, start_ratio,
             end_ratio) in self.momentum_phases:
            # Position inside the current cycle.
            curr_iter = runner.iter % max_iter_per_phase
            if start_iter <= curr_iter < end_iter:
                progress = curr_iter - start_iter
                return annealing_cos(base_momentum * start_ratio,
                                     base_momentum * end_ratio,
                                     progress / (end_iter - start_iter))
def before_run(self, runner):
    """Seed every param group with its momentum bounds and build the phases.

    For each optimizer (``runner.optimizer`` may be a single optimizer or a
    dict of them) the starting momentum is set to the configured maximum,
    and ``base_momentum`` / ``max_momentum`` are stored on each param group
    so that ``get_momentum`` can look them up by key later. Optimizers with
    ``betas`` in their defaults get the first beta replaced; the others get
    ``momentum`` set.

    Args:
        runner: Runner exposing ``optimizer`` and ``max_iters``.

    Raises:
        ValueError: If an optimizer has neither ``momentum`` nor ``betas``
            in its defaults.
    """
    if isinstance(runner.optimizer, dict):
        named_optimizers = list(runner.optimizer.items())
    else:
        # A single optimizer is identified by its class name.
        named_optimizers = [(type(runner.optimizer).__name__,
                             runner.optimizer)]

    for name, optim in named_optimizers:
        if ('momentum' not in optim.defaults
                and 'betas' not in optim.defaults):
            # The two literal pieces previously ran together as
            # "withoption"; the space is restored here.
            raise ValueError('optimizer must support momentum with '
                             'option enabled')
        self.use_beta1 = 'betas' in optim.defaults
        _base_momentum = format_param(name, optim, self._base_momentum)
        _max_momentum = format_param(name, optim, self._max_momentum)
        for group, b_momentum, m_momentum in zip(optim.param_groups,
                                                 _base_momentum,
                                                 _max_momentum):
            if self.use_beta1:
                _, beta2 = group['betas']
                group['betas'] = (m_momentum, beta2)
            else:
                group['momentum'] = m_momentum
            group['base_momentum'] = b_momentum
            group['max_momentum'] = m_momentum

    # Phases reference param-group keys; they are resolved per group in
    # get_momentum. Rebuilt from scratch so repeated runs do not stack
    # duplicate phases.
    first_phase = {
        'end_iter': float(self.pct_start * runner.max_iters) - 1,
        'start_momentum': 'max_momentum',
        'end_momentum': 'base_momentum'
    }
    if self.three_phase:
        self.momentum_phases = [
            first_phase,
            {
                'end_iter':
                float(2 * self.pct_start * runner.max_iters) - 2,
                'start_momentum': 'base_momentum',
                'end_momentum': 'max_momentum'
            },
            {
                'end_iter': runner.max_iters - 1,
                'start_momentum': 'max_momentum',
                'end_momentum': 'max_momentum'
            },
        ]
    else:
        self.momentum_phases = [
            first_phase,
            {
                'end_iter': runner.max_iters - 1,
                'start_momentum': 'base_momentum',
                'end_momentum': 'max_momentum'
            },
        ]
def get_momentum(self, runner, param_group):
    """Return the annealed momentum of ``param_group`` at ``runner.iter``.

    The phase containing the current iteration is located, the fraction of
    that phase already completed is computed, and ``self.anneal_func``
    interpolates between the phase's start and end momentum. Both are
    looked up in ``param_group`` through the keys stored in the phase.

    Args:
        runner: Object exposing the current ``iter``.
        param_group (dict): Param group carrying the keys named by the
            phases (``'base_momentum'`` / ``'max_momentum'``).

    Returns:
        The momentum produced by ``self.anneal_func``.
    """
    curr_iter = runner.iter
    start_iter = 0
    last_index = len(self.momentum_phases) - 1
    for i, phase in enumerate(self.momentum_phases):
        end_iter = phase['end_iter']
        # The last phase also absorbs any iteration past its nominal end.
        if curr_iter <= end_iter or i == last_index:
            span = end_iter - start_iter
            # A zero-length phase (e.g. pct_start * max_iters == 1 makes the
            # first phase start and end at iteration 0) used to divide by
            # zero; treat such a phase as already complete.
            pct = (curr_iter - start_iter) / span if span else 1.0
            momentum = self.anneal_func(
                param_group[phase['start_momentum']],
                param_group[phase['end_momentum']], pct)
            break
        start_iter = end_iter
    return momentum
@HOOKS.register_module()
class OptimizerHook(Hook):
    """Run backward and the optimizer step after every training iteration.

    Args:
        grad_clip (dict, optional): Keyword arguments forwarded to
            ``clip_grad.clip_grad_norm_``. ``None`` disables gradient
            clipping. Default: None.
    """

    def __init__(self, grad_clip=None):
        self.grad_clip = grad_clip

    def clip_grads(self, params):
        """Clip the gradient norm of ``params`` that actually have gradients.

        Returns:
            The value returned by ``clip_grad_norm_``, or ``None`` when no
            parameter both requires grad and holds a gradient.
        """
        clippable = [
            p for p in params if p.requires_grad and p.grad is not None
        ]
        if len(clippable) == 0:
            return None
        return clip_grad.clip_grad_norm_(clippable, **self.grad_clip)

    def after_train_iter(self, runner):
        """Zero grads, backpropagate the loss, optionally clip, then step."""
        optimizer = runner.optimizer
        optimizer.zero_grad()
        runner.outputs['loss'].backward()
        if self.grad_clip is not None:
            grad_norm = self.clip_grads(runner.model.parameters())
            if grad_norm is not None:
                # Add grad norm to the logger
                runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                         runner.outputs['num_samples'])
        optimizer.step()
def _init(self, runner):
    """Compute the accumulation boundaries once, on the first iteration.

    ``after_train_iter`` compares the absolute ``runner.iter`` against
    ``self.divisible_iters`` to choose the loss divisor, so the boundary
    must be an absolute iteration index too. It used to be derived from
    the iterations remaining after a resume, which made a resumed run
    switch to ``remainder_iters`` far too early. For a fresh run
    (``runner.iter == 0``) the values are unchanged.

    Args:
        runner: Runner exposing ``iter``, ``max_iters``, ``model`` and
            ``logger``.
    """
    if runner.iter % self.cumulative_iters != 0:
        runner.logger.warning(
            'Resume iter number is not divisible by cumulative_iters in '
            'GradientCumulativeOptimizerHook, which means the gradient of '
            'some iters is lost and the result may be influenced slightly.'
        )

    if self.has_batch_norm(runner.model) and self.cumulative_iters > 1:
        runner.logger.warning(
            'GradientCumulativeOptimizerHook may slightly decrease '
            'performance if the model has BatchNorm layers.')

    # Iterations below `divisible_iters` belong to full accumulation groups;
    # the trailing `remainder_iters` iterations form the final, shorter one.
    self.divisible_iters = (
        runner.max_iters // self.cumulative_iters * self.cumulative_iters)
    self.remainder_iters = runner.max_iters - self.divisible_iters

    self.initialized = True
def copy_grads_to_fp32(self, fp16_net, fp32_weights):
    """Copy gradients from the fp16 model onto the fp32 weight copy.

    Parameters are paired positionally: the i-th entry of ``fp32_weights``
    receives the gradient of the i-th parameter of ``fp16_net``. Model
    parameters without a gradient are skipped, and a missing fp32 gradient
    buffer is allocated on demand.
    """
    for master, source in zip(fp32_weights, fp16_net.parameters()):
        if source.grad is None:
            continue
        if master.grad is None:
            master.grad = master.data.new(master.size())
        master.grad.copy_(source.grad)
def after_train_iter(self, runner):
    """Run one mixed-precision optimisation step through the loss scaler.

    Steps, in order:

    1. Clear the gradients left from the previous iteration.
    2. Scale the loss and backpropagate it.
    3. Unscale the optimizer's gradients (so clipping sees real values).
    4. Optionally clip gradients and log the gradient norm.
    5. Step the optimizer through the scaler and update the scale.
    6. Store the scaler state in ``runner.meta['fp16']`` for resuming.
    """
    scaler = self.loss_scaler
    optimizer = runner.optimizer

    # clear grads of last iteration
    runner.model.zero_grad()
    optimizer.zero_grad()

    scaled_loss = scaler.scale(runner.outputs['loss'])
    scaled_loss.backward()
    scaler.unscale_(optimizer)

    # grad clip
    if self.grad_clip is not None:
        grad_norm = self.clip_grads(runner.model.parameters())
        if grad_norm is not None:
            # Add grad norm to the logger
            runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                     runner.outputs['num_samples'])

    # backward and update scaler
    scaler.step(optimizer)
    scaler.update(self._scale_update_param)

    # save state_dict of loss_scaler
    scaler_state = scaler.state_dict()
    runner.meta.setdefault('fp16', {})['loss_scaler'] = scaler_state
def after_train_iter(self, runner):
    """Accumulate scaled gradients and step once per accumulation group.

    The loss is divided by the size of the current accumulation group,
    scaled and backpropagated on every iteration. The optimizer step runs
    only when a group completes (``every_n_iters``) or on the last
    iteration. That step unscales, optionally clips, steps through the
    scaler, records the scaler state in ``runner.meta['fp16']`` and
    finally clears the gradients.
    """
    if not self.initialized:
        self._init(runner)

    in_full_group = runner.iter < self.divisible_iters
    loss_factor = (
        self.cumulative_iters if in_full_group else self.remainder_iters)
    loss = runner.outputs['loss'] / loss_factor

    self.loss_scaler.scale(loss).backward()

    group_complete = (
        self.every_n_iters(runner, self.cumulative_iters)
        or self.is_last_iter(runner))
    if not group_complete:
        return

    # Bring the accumulated gradients back to their real magnitude before
    # clipping and stepping.
    self.loss_scaler.unscale_(runner.optimizer)

    if self.grad_clip is not None:
        grad_norm = self.clip_grads(runner.model.parameters())
        if grad_norm is not None:
            # Add grad norm to the logger
            runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                     runner.outputs['num_samples'])

    # backward and update scaler
    self.loss_scaler.step(runner.optimizer)
    self.loss_scaler.update(self._scale_update_param)

    # save state_dict of loss_scaler
    scaler_state = self.loss_scaler.state_dict()
    runner.meta.setdefault('fp16', {})['loss_scaler'] = scaler_state

    # clear grads
    runner.model.zero_grad()
    runner.optimizer.zero_grad()
def before_run(self, runner):
    """Preparing steps before Mixed Precision Training.

    1. Make a master copy of fp32 weights for optimization: the
       optimizer's param groups are deep-copied and its state is re-keyed
       to the copied parameters.
    2. Convert the main model from fp32 to fp16.
    3. Restore the loss scaler state if ``runner.meta`` carries one.
    """
    optimizer = runner.optimizer

    # keep a copy of fp32 weights
    model_groups = optimizer.param_groups
    optimizer.param_groups = copy.deepcopy(optimizer.param_groups)

    # Pair each original parameter with its copy; both sequences follow the
    # same group order, so they line up positionally.
    original_params = chain(*(g['params'] for g in model_groups))
    copied_params = chain(*(g['params'] for g in optimizer.param_groups))
    p_map = {old_p: p for old_p, p in zip(original_params, copied_params)}

    # Existing optimizer state must follow the parameters it belongs to.
    state = defaultdict(dict)
    for old_p, param_state in optimizer.state.items():
        state[p_map[old_p]] = param_state
    optimizer.state = state

    # convert model to fp16
    wrap_fp16_model(runner.model)

    # resume from state dict
    if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']:
        self.loss_scaler.load_state_dict(runner.meta['fp16']['loss_scaler'])
def after_train_iter(self, runner):
    """Backward optimization steps for Mixed Precision Training.

    1. Scale the loss by the current loss scale and backpropagate (fp16).
    2. Copy gradients from the model to the fp32 weight copy.
    3. All-reduce the fp32 gradients when ``self.distributed`` is set.
    4. Unless an overflow is detected: scale the gradients back,
       optionally clip them, update the fp32 weight copy and copy the
       params back to the fp16 model.
    5. Update the loss scale, warning when an overflow occurred.
    6. Save the loss_scaler state_dict for resume purposes.
    """
    # clear grads of last iteration
    runner.model.zero_grad()
    runner.optimizer.zero_grad()

    # scale the loss value
    (runner.outputs['loss'] * self.loss_scaler.loss_scale).backward()

    # copy fp16 grads in the model to fp32 params in the optimizer
    fp32_weights = [
        param for group in runner.optimizer.param_groups
        for param in group['params']
    ]
    self.copy_grads_to_fp32(runner.model, fp32_weights)

    # allreduce grads
    if self.distributed:
        allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb)

    has_overflow = self.loss_scaler.has_overflow(fp32_weights)
    # if has overflow, skip this iteration
    if not has_overflow:
        # scale the gradients back
        for param in fp32_weights:
            if param.grad is None:
                continue
            param.grad.div_(self.loss_scaler.loss_scale)
        if self.grad_clip is not None:
            grad_norm = self.clip_grads(fp32_weights)
            if grad_norm is not None:
                # Add grad norm to the logger
                runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                         runner.outputs['num_samples'])
        # update fp32 params
        runner.optimizer.step()
        # copy fp32 params to the fp16 model
        self.copy_params_to_fp16(runner.model, fp32_weights)

    self.loss_scaler.update_scale(has_overflow)
    if has_overflow:
        # Logged after update_scale, so the message shows the new scale.
        runner.logger.warning('Check overflow, downscale loss scale '
                              f'to {self.loss_scaler.cur_scale}')

    # save state_dict of loss_scaler
    scaler_state = self.loss_scaler.state_dict()
    runner.meta.setdefault('fp16', {})['loss_scaler'] = scaler_state
'Check overflow, downscale loss scale ' + f'to {self.loss_scaler.cur_scale}') + + self.loss_scaler.update_scale(has_overflow) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + # clear grads + runner.model.zero_grad() + runner.optimizer.zero_grad() diff --git a/lavis/common/annotator/uniformer/mmcv/runner/hooks/profiler.py b/lavis/common/annotator/uniformer/mmcv/runner/hooks/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..b70236997eec59c2209ef351ae38863b4112d0ec --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/runner/hooks/profiler.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Callable, List, Optional, Union + +import torch + +from ..dist_utils import master_only +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class ProfilerHook(Hook): + """Profiler to analyze performance during training. + + PyTorch Profiler is a tool that allows the collection of the performance + metrics during the training. More details on Profiler can be found at + https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile + + Args: + by_epoch (bool): Profile performance by epoch or by iteration. + Default: True. + profile_iters (int): Number of iterations for profiling. + If ``by_epoch=True``, profile_iters indicates that they are the + first profile_iters epochs at the beginning of the + training, otherwise it indicates the first profile_iters + iterations. Default: 1. + activities (list[str]): List of activity groups (CPU, CUDA) to use in + profiling. Default: ['cpu', 'cuda']. + schedule (dict, optional): Config of generating the callable schedule. + if schedule is None, profiler will not add step markers into the + trace and table view. Default: None. + on_trace_ready (callable, dict): Either a handler or a dict of generate + handler. Default: None. 
+ record_shapes (bool): Save information about operator's input shapes. + Default: False. + profile_memory (bool): Track tensor memory allocation/deallocation. + Default: False. + with_stack (bool): Record source information (file and line number) + for the ops. Default: False. + with_flops (bool): Use formula to estimate the FLOPS of specific + operators (matrix multiplication and 2D convolution). + Default: False. + json_trace_path (str, optional): Exports the collected trace in Chrome + JSON format. Default: None. + + Example: + >>> runner = ... # instantiate a Runner + >>> # tensorboard trace + >>> trace_config = dict(type='tb_trace', dir_name='work_dir') + >>> profiler_config = dict(on_trace_ready=trace_config) + >>> runner.register_profiler_hook(profiler_config) + >>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)]) + """ + + def __init__(self, + by_epoch: bool = True, + profile_iters: int = 1, + activities: List[str] = ['cpu', 'cuda'], + schedule: Optional[dict] = None, + on_trace_ready: Optional[Union[Callable, dict]] = None, + record_shapes: bool = False, + profile_memory: bool = False, + with_stack: bool = False, + with_flops: bool = False, + json_trace_path: Optional[str] = None) -> None: + try: + from torch import profiler # torch version >= 1.8.1 + except ImportError: + raise ImportError('profiler is the new feature of torch1.8.1, ' + f'but your version is {torch.__version__}') + + assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.' 
@master_only
def before_run(self, runner):
    """Validate the profiling window, build the trace handler and start
    the profiler.

    ``self.on_trace_ready`` may be a callable (used as-is), ``None``, or a
    config dict whose ``type`` is ``'log_trace'`` (print the averaged table)
    or ``'tb_trace'`` (TensorBoard trace handler). The remaining dict keys
    are forwarded to the chosen handler.

    Args:
        runner: Runner exposing ``max_epochs``, ``max_iters`` and
            ``logger``.

    Raises:
        ValueError: If ``profile_iters`` exceeds the run length, or
            ``on_trace_ready`` / its ``type`` is not supported.
        ImportError: If ``'tb_trace'`` is requested without
            ``torch_tb_profiler`` installed.
    """
    if self.by_epoch and runner.max_epochs < self.profile_iters:
        raise ValueError('self.profile_iters should not be greater than '
                         f'{runner.max_epochs}')

    if not self.by_epoch and runner.max_iters < self.profile_iters:
        raise ValueError('self.profile_iters should not be greater than '
                         f'{runner.max_iters}')

    if callable(self.on_trace_ready):  # handler
        _on_trace_ready = self.on_trace_ready
    elif isinstance(self.on_trace_ready, dict):  # config of handler
        # Copy so popping 'type' does not mutate the user's config.
        trace_cfg = self.on_trace_ready.copy()
        trace_type = trace_cfg.pop('type')  # log_trace handler
        if trace_type == 'log_trace':

            def _log_handler(prof):
                print(prof.key_averages().table(**trace_cfg))

            _on_trace_ready = _log_handler
        elif trace_type == 'tb_trace':  # tensorboard_trace handler
            try:
                import torch_tb_profiler  # noqa: F401
            except ImportError:
                raise ImportError('please run "pip install '
                                  'torch-tb-profiler" to install '
                                  'torch_tb_profiler')
            _on_trace_ready = torch.profiler.tensorboard_trace_handler(
                **trace_cfg)
        else:
            raise ValueError('trace_type should be "log_trace" or '
                             f'"tb_trace", but got {trace_type}')
    elif self.on_trace_ready is None:
        _on_trace_ready = None  # type: ignore
    else:
        raise ValueError('on_trace_ready should be handler, dict or None, '
                         f'but got {type(self.on_trace_ready)}')

    # NOTE(review): max_epochs is presumably None on runners configured
    # only by iterations (confirm against the runner classes); comparing
    # None with an int would raise TypeError, so the advisory warning is
    # skipped in that case.
    if runner.max_epochs is not None and runner.max_epochs > 1:
        warnings.warn(f'profiler will profile {runner.max_epochs} epochs '
                      'instead of 1 epoch. Since profiler will slow down '
                      'the training, it is recommended to train 1 epoch '
                      'with ProfilerHook and adjust your setting according'
                      ' to the profiler summary. During normal training '
                      '(epoch > 1), you may disable the ProfilerHook.')

    self.profiler = torch.profiler.profile(
        activities=self.activities,
        schedule=self.schedule,
        on_trace_ready=_on_trace_ready,
        record_shapes=self.record_shapes,
        profile_memory=self.profile_memory,
        with_stack=self.with_stack,
        with_flops=self.with_flops)

    # Entered manually; the matching __exit__ is issued by the
    # after_train_epoch / after_train_iter hooks.
    self.profiler.__enter__()
    runner.logger.info('profiler is profiling...')
@HOOKS.register_module()
class DistSamplerSeedHook(Hook):
    """Data-loading sampler for distributed training.

    When distributed training, it is only useful in conjunction with
    :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same
    purpose with :obj:`IterLoader`.
    """

    def before_epoch(self, runner):
        """Pass the current epoch to the data loader's sampler.

        ``set_epoch(runner.epoch)`` is called on ``runner.data_loader.sampler``
        when that object provides the method; otherwise on
        ``runner.data_loader.batch_sampler.sampler`` when that one does. If
        neither provides ``set_epoch`` nothing happens.

        Args:
            runner: The runner; its ``data_loader`` and ``epoch`` attributes
                are read.
        """
        data_loader = runner.data_loader
        sampler = data_loader.sampler
        if not hasattr(sampler, 'set_epoch'):
            # The batch sampler wraps the real sampler as an attribute.
            # ``batch_sampler`` can be None (e.g. a PyTorch DataLoader with
            # automatic batching disabled), so look both levels up
            # defensively instead of raising AttributeError.
            batch_sampler = getattr(data_loader, 'batch_sampler', None)
            sampler = getattr(batch_sampler, 'sampler', None)
        if hasattr(sampler, 'set_epoch'):
            sampler.set_epoch(runner.epoch)
class IterLoader:
    """Endless iterator over a data loader that counts finished passes.

    Every time the wrapped loader is exhausted the epoch counter is
    incremented, the sampler is told about the new epoch (if it has a
    ``set_epoch`` method) and iteration restarts from the beginning.
    """

    def __init__(self, dataloader):
        self._dataloader = dataloader
        self.iter_loader = iter(dataloader)
        self._epoch = 0

    @property
    def epoch(self):
        """Number of complete passes made over the wrapped loader."""
        return self._epoch

    def __next__(self):
        try:
            return next(self.iter_loader)
        except StopIteration:
            pass

        # The current pass is exhausted: start a new one.
        self._epoch += 1
        sampler = self._dataloader.sampler
        if hasattr(sampler, 'set_epoch'):
            sampler.set_epoch(self._epoch)
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        self.iter_loader = iter(self._dataloader)
        return next(self.iter_loader)

    def __len__(self):
        return len(self._dataloader)
def train(self, data_loader, **kwargs):
    """Run one training iteration.

    Puts the model in training mode, pulls the next batch from
    ``data_loader``, calls ``model.train_step`` between the
    ``before_train_iter`` and ``after_train_iter`` hooks, records any
    ``log_vars`` in the log buffer and advances the iteration counters.

    Args:
        data_loader: Iterator yielding batches; its ``epoch`` attribute is
            copied to ``self._epoch``.
        **kwargs: Forwarded to ``model.train_step``.

    Raises:
        TypeError: If ``model.train_step`` does not return a dict.
    """
    self.model.train()
    self.mode = 'train'
    self.data_loader = data_loader
    self._epoch = data_loader.epoch
    batch = next(data_loader)

    self.call_hook('before_train_iter')
    step_result = self.model.train_step(batch, self.optimizer, **kwargs)
    if not isinstance(step_result, dict):
        raise TypeError('model.train_step() must return a dict')
    if 'log_vars' in step_result:
        self.log_buffer.update(step_result['log_vars'],
                               step_result['num_samples'])
    self.outputs = step_result
    self.call_hook('after_train_iter')

    self._inner_iter += 1
    self._iter += 1
def resume(self,
           checkpoint,
           resume_optimizer=True,
           map_location='default'):
    """Resume model from checkpoint.

    Args:
        checkpoint (str): Checkpoint to resume from.
        resume_optimizer (bool, optional): Whether resume the optimizer(s)
            if the checkpoint file includes optimizer(s). Default to True.
        map_location (str, optional): Same as :func:`torch.load`.
            Default to 'default', which maps storages to the current CUDA
            device when CUDA is available and to the CPU otherwise.

    Raises:
        TypeError: If the optimizer state has to be restored but
            ``self.optimizer`` is neither an ``Optimizer`` nor a dict.
    """
    if map_location == 'default':
        if torch.cuda.is_available():
            device_id = torch.cuda.current_device()
            checkpoint = self.load_checkpoint(
                checkpoint,
                map_location=lambda storage, loc: storage.cuda(device_id))
        else:
            # Without CUDA, querying the current device fails; load the
            # checkpoint onto the CPU so resuming works on CPU-only hosts.
            checkpoint = self.load_checkpoint(
                checkpoint, map_location='cpu')
    else:
        checkpoint = self.load_checkpoint(
            checkpoint, map_location=map_location)

    meta = checkpoint['meta']
    self._epoch = meta['epoch']
    self._iter = meta['iter']
    self._inner_iter = meta['iter']
    if 'optimizer' in checkpoint and resume_optimizer:
        if isinstance(self.optimizer, Optimizer):
            self.optimizer.load_state_dict(checkpoint['optimizer'])
        elif isinstance(self.optimizer, dict):
            # One state dict per named optimizer.
            for k in self.optimizer.keys():
                self.optimizer[k].load_state_dict(
                    checkpoint['optimizer'][k])
        else:
            raise TypeError(
                'Optimizer should be dict or torch.optim.Optimizer '
                f'but got {type(self.optimizer)}')

    self.logger.info(f'resumed from epoch: {self.epoch}, iter {self.iter}')
+ # More details in https://github.com/open-mmlab/mmcv/pull/1108 + meta.update(epoch=self.epoch + 1, iter=self.iter) + + filename = filename_tmpl.format(self.iter + 1) + filepath = osp.join(out_dir, filename) + optimizer = self.optimizer if save_optimizer else None + save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) + # in some environments, `os.symlink` is not supported, you may need to + # set `create_symlink` to False + if create_symlink: + dst_file = osp.join(out_dir, 'latest.pth') + if platform.system() != 'Windows': + mmcv.symlink(filename, dst_file) + else: + shutil.copy(filepath, dst_file) + + def register_training_hooks(self, + lr_config, + optimizer_config=None, + checkpoint_config=None, + log_config=None, + momentum_config=None, + custom_hooks_config=None): + """Register default hooks for iter-based training. + + Checkpoint hook, optimizer stepper hook and logger hooks will be set to + `by_epoch=False` by default. + + Default hooks include: + + +----------------------+-------------------------+ + | Hooks | Priority | + +======================+=========================+ + | LrUpdaterHook | VERY_HIGH (10) | + +----------------------+-------------------------+ + | MomentumUpdaterHook | HIGH (30) | + +----------------------+-------------------------+ + | OptimizerStepperHook | ABOVE_NORMAL (40) | + +----------------------+-------------------------+ + | CheckpointSaverHook | NORMAL (50) | + +----------------------+-------------------------+ + | IterTimerHook | LOW (70) | + +----------------------+-------------------------+ + | LoggerHook(s) | VERY_LOW (90) | + +----------------------+-------------------------+ + | CustomHook(s) | defaults to NORMAL (50) | + +----------------------+-------------------------+ + + If custom hooks have same priority with default hooks, custom hooks + will be triggered after default hooks. 
class LogBuffer:
    """Buffer of logged values and the sample counts they were logged with.

    ``update`` appends values per key; ``average`` writes a count-weighted
    mean per key into ``output`` and sets ``ready``.
    """

    def __init__(self):
        self.val_history = OrderedDict()
        self.n_history = OrderedDict()
        self.output = OrderedDict()
        self.ready = False

    def clear(self):
        """Drop the recorded history as well as the averaged output."""
        self.val_history.clear()
        self.n_history.clear()
        self.clear_output()

    def clear_output(self):
        """Drop the averaged output and mark the buffer as not ready."""
        self.output.clear()
        self.ready = False

    def update(self, vars, count=1):
        """Append one value per key, together with its sample count.

        Args:
            vars (dict): Mapping from key to the value to record.
            count (int): Sample count stored alongside every value.
        """
        assert isinstance(vars, dict)
        for key, var in vars.items():
            if key not in self.val_history:
                self.val_history[key], self.n_history[key] = [], []
            self.val_history[key].append(var)
            self.n_history[key].append(count)

    def average(self, n=0):
        """Average latest n values or all values."""
        assert n >= 0
        # ``slice(-0, None)`` is ``slice(0, None)``, so n == 0 selects the
        # complete history.
        window = slice(-n, None)
        for key, history in self.val_history.items():
            values = np.array(history[window])
            nums = np.array(self.n_history[key][window])
            self.output[key] = np.sum(values * nums) / np.sum(nums)
        self.ready = True
b/lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53c34d0470992cbc374f29681fdd00dc0e57968d --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, + build_optimizer_constructor) +from .default_constructor import DefaultOptimizerConstructor + +__all__ = [ + 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', + 'build_optimizer', 'build_optimizer_constructor' +] diff --git a/lavis/common/annotator/uniformer/mmcv/runner/optimizer/builder.py b/lavis/common/annotator/uniformer/mmcv/runner/optimizer/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f9234eed8f1f186d9d8dfda34562157ee39bdb3a --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/runner/optimizer/builder.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def build_optimizer(model, cfg):
    """Build an optimizer for ``model`` from a config dict.

    A deep copy of ``cfg`` is taken, so the caller's dict is left untouched.
    The optional keys ``constructor`` (default
    ``'DefaultOptimizerConstructor'``) and ``paramwise_cfg`` (default None)
    are removed from the copy; what remains is handed to the optimizer
    constructor as ``optimizer_cfg``.

    Args:
        model: Object passed to the optimizer constructor.
        cfg (dict): Optimizer config.

    Returns:
        Whatever the built optimizer constructor returns for ``model``.
    """
    remaining_cfg = copy.deepcopy(cfg)
    constructor_type = remaining_cfg.pop('constructor',
                                         'DefaultOptimizerConstructor')
    paramwise_cfg = remaining_cfg.pop('paramwise_cfg', None)
    constructor = build_optimizer_constructor({
        'type': constructor_type,
        'optimizer_cfg': remaining_cfg,
        'paramwise_cfg': paramwise_cfg,
    })
    return constructor(model)
+ + By default each parameter share the same optimizer settings, and we + provide an argument ``paramwise_cfg`` to specify parameter-wise settings. + It is a dict and may contain the following fields: + + - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will + be ignored. It should be noted that the aforementioned ``key`` is the + longest key that is a substring of the name of the parameter. If there + are multiple matched keys with the same length, then the key with lower + alphabet order will be chosen. + ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` + and ``decay_mult``. See Example 2 below. + - ``bias_lr_mult`` (float): It will be multiplied to the learning + rate for all bias parameters (except for those in normalization + layers and offset layers of DCN). + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in + normalization layers, depthwise conv layers, offset layers of DCN). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization + layers. + - ``dwconv_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of depthwise conv + layers. + - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning + rate for parameters of offset layer in the deformable convs + of a model. + - ``bypass_duplicate`` (bool): If true, the duplicate parameters + would not be added into optimizer. Default: False. + + Note: + 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will + override the effect of ``bias_lr_mult`` in the bias of offset + layer. 
So be careful when using both ``bias_lr_mult`` and + ``dcn_offset_lr_mult``. If you wish to apply both of them to the + offset layer in deformable convs, set ``dcn_offset_lr_mult`` + to the original ``dcn_offset_lr_mult`` * ``bias_lr_mult``. + 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will + apply it to all the DCN layers in the model. So be careful when + the model contains multiple DCN layers in places other than + backbone. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + optimizer_cfg (dict): The config dict of the optimizer. + Positional fields are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + paramwise_cfg (dict, optional): Parameter-wise options. + + Example 1: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, + >>> weight_decay=0.0001) + >>> paramwise_cfg = dict(norm_decay_mult=0.) + >>> optim_builder = DefaultOptimizerConstructor( + >>> optimizer_cfg, paramwise_cfg) + >>> optimizer = optim_builder(model) + + Example 2: + >>> # assume model have attribute model.backbone and model.cls_head + >>> optimizer_cfg = dict(type='SGD', lr=0.01, weight_decay=0.95) + >>> paramwise_cfg = dict(custom_keys={ + '.backbone': dict(lr_mult=0.1, decay_mult=0.9)}) + >>> optim_builder = DefaultOptimizerConstructor( + >>> optimizer_cfg, paramwise_cfg) + >>> optimizer = optim_builder(model) + >>> # Then the `lr` and `weight_decay` for model.backbone is + >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for + >>> # model.cls_head is (0.01, 0.95). 
def __init__(self, optimizer_cfg, paramwise_cfg=None):
    """Store the optimizer settings and validate the param-wise options.

    Args:
        optimizer_cfg (dict): The config dict of the optimizer. Its ``lr``
            and ``weight_decay`` entries (None when absent) are kept as
            ``base_lr`` and ``base_wd``.
        paramwise_cfg (dict, optional): Parameter-wise options. None is
            stored as an empty dict.

    Raises:
        TypeError: If ``optimizer_cfg`` is not a dict.
    """
    if not isinstance(optimizer_cfg, dict):
        # Adjacent literals form ONE message; a comma here would hand
        # TypeError two separate arguments and garble the error text.
        raise TypeError('optimizer_cfg should be a dict, '
                        f'but got {type(optimizer_cfg)}')
    self.optimizer_cfg = optimizer_cfg
    self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg
    self.base_lr = optimizer_cfg.get('lr', None)
    self.base_wd = optimizer_cfg.get('weight_decay', None)
    self._validate_cfg()
+ module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if not param.requires_grad: + params.append(param_group) + continue + if bypass_duplicate and self._is_in(param_group, params): + warnings.warn(f'{prefix} is duplicate. It is skipped since ' + f'bypass_duplicate={bypass_duplicate}') + continue + # if the parameter match one of the custom keys, ignore other rules + is_custom = False + for key in sorted_keys: + if key in f'{prefix}.{name}': + is_custom = True + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) 
def __call__(self, model):
    """Build the optimizer for ``model``.

    If ``model`` has a ``module`` attribute, that attribute is used in its
    place. With an empty ``paramwise_cfg`` the optimizer receives
    ``model.parameters()``; otherwise it receives the param groups collected
    by ``add_params``.

    Args:
        model: The model (or a wrapper exposing it as ``module``).

    Returns:
        The object built by ``build_from_cfg`` from the optimizer config.
    """
    target = model.module if hasattr(model, 'module') else model

    # Work on a shallow copy so the stored config never gains a 'params' key.
    build_cfg = self.optimizer_cfg.copy()
    if self.paramwise_cfg:
        # set param-wise lr and weight decay recursively
        param_groups = []
        self.add_params(param_groups, target)
    else:
        # if no paramwise option is specified, just use the global setting
        param_groups = target.parameters()
    build_cfg['params'] = param_groups

    return build_from_cfg(build_cfg, OPTIMIZERS)
class Priority(Enum):
    """Hook priority levels.

    +--------------+------------+
    | Level        | Value      |
    +==============+============+
    | HIGHEST      | 0          |
    +--------------+------------+
    | VERY_HIGH    | 10         |
    +--------------+------------+
    | HIGH         | 30         |
    +--------------+------------+
    | ABOVE_NORMAL | 40         |
    +--------------+------------+
    | NORMAL       | 50         |
    +--------------+------------+
    | BELOW_NORMAL | 60         |
    +--------------+------------+
    | LOW          | 70         |
    +--------------+------------+
    | VERY_LOW     | 90         |
    +--------------+------------+
    | LOWEST       | 100        |
    +--------------+------------+
    """

    HIGHEST = 0
    VERY_HIGH = 10
    HIGH = 30
    ABOVE_NORMAL = 40
    NORMAL = 50
    BELOW_NORMAL = 60
    LOW = 70
    VERY_LOW = 90
    LOWEST = 100


def get_priority(priority):
    """Get priority value.

    Args:
        priority (int or str or :obj:`Priority`): Priority. A string is
            upper-cased and looked up by name in :obj:`Priority`.

    Returns:
        int: The priority value.

    Raises:
        ValueError: If an integer priority lies outside 0..100.
        TypeError: If ``priority`` is not an int, str or :obj:`Priority`.
    """
    if isinstance(priority, int):
        if not 0 <= priority <= 100:
            raise ValueError('priority must be between 0 and 100')
        return priority
    if isinstance(priority, Priority):
        return priority.value
    if isinstance(priority, str):
        return Priority[priority.upper()].value
    raise TypeError('priority must be an integer or Priority enum value')
def get_host_info():
    """Get hostname and username.

    Return empty string if exception raised, e.g. ``getpass.getuser()`` will
    lead to error in docker container

    Returns:
        str: ``'<user>@<host>'``, or ``''`` (after emitting a warning) when
        the user or host name cannot be determined.
    """
    try:
        return f'{getuser()}@{gethostname()}'
    except Exception as e:
        # Best effort only: report the problem and fall back to ''. The
        # return is deliberately NOT placed in a ``finally`` clause, which
        # would also swallow KeyboardInterrupt/SystemExit raised meanwhile.
        warnings.warn(f'Host or user not found: {str(e)}')
        return ''
+ deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + rank_shift (bool): Whether to add rank number to the random seed to + have different random seed in different threads. Default: False. + """ + if use_rank_shift: + rank, _ = mmcv.runner.get_dist_info() + seed += rank + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False diff --git a/lavis/common/annotator/uniformer/mmcv/utils/__init__.py b/lavis/common/annotator/uniformer/mmcv/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..378a0068432a371af364de9d73785901c0f83383 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/utils/__init__.py @@ -0,0 +1,69 @@ +# flake8: noqa +# Copyright (c) OpenMMLab. All rights reserved. 
+from .config import Config, ConfigDict, DictAction +from .misc import (check_prerequisites, concat_list, deprecated_api_warning, + has_method, import_modules_from_strings, is_list_of, + is_method_overridden, is_seq_of, is_str, is_tuple_of, + iter_cast, list_cast, requires_executable, requires_package, + slice_list, to_1tuple, to_2tuple, to_3tuple, to_4tuple, + to_ntuple, tuple_cast) +from .path import (check_file_exist, fopen, is_filepath, mkdir_or_exist, + scandir, symlink) +from .progressbar import (ProgressBar, track_iter_progress, + track_parallel_progress, track_progress) +from .testing import (assert_attrs_equal, assert_dict_contains_subset, + assert_dict_has_keys, assert_is_norm_layer, + assert_keys_equal, assert_params_all_zeros, + check_python_script) +from .timer import Timer, TimerError, check_time +from .version_utils import digit_version, get_git_hash + +try: + import torch +except ImportError: + __all__ = [ + 'Config', 'ConfigDict', 'DictAction', 'is_str', 'iter_cast', + 'list_cast', 'tuple_cast', 'is_seq_of', 'is_list_of', 'is_tuple_of', + 'slice_list', 'concat_list', 'check_prerequisites', 'requires_package', + 'requires_executable', 'is_filepath', 'fopen', 'check_file_exist', + 'mkdir_or_exist', 'symlink', 'scandir', 'ProgressBar', + 'track_progress', 'track_iter_progress', 'track_parallel_progress', + 'Timer', 'TimerError', 'check_time', 'deprecated_api_warning', + 'digit_version', 'get_git_hash', 'import_modules_from_strings', + 'assert_dict_contains_subset', 'assert_attrs_equal', + 'assert_dict_has_keys', 'assert_keys_equal', 'check_python_script', + 'to_1tuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'to_ntuple', + 'is_method_overridden', 'has_method' + ] +else: + from .env import collect_env + from .logging import get_logger, print_log + from .parrots_jit import jit, skip_no_elena + from .parrots_wrapper import ( + TORCH_VERSION, BuildExtension, CppExtension, CUDAExtension, DataLoader, + PoolDataLoader, SyncBatchNorm, _AdaptiveAvgPoolNd, 
_AdaptiveMaxPoolNd, + _AvgPoolNd, _BatchNorm, _ConvNd, _ConvTransposeMixin, _InstanceNorm, + _MaxPoolNd, get_build_config, is_rocm_pytorch, _get_cuda_home) + from .registry import Registry, build_from_cfg + from .trace import is_jit_tracing + __all__ = [ + 'Config', 'ConfigDict', 'DictAction', 'collect_env', 'get_logger', + 'print_log', 'is_str', 'iter_cast', 'list_cast', 'tuple_cast', + 'is_seq_of', 'is_list_of', 'is_tuple_of', 'slice_list', 'concat_list', + 'check_prerequisites', 'requires_package', 'requires_executable', + 'is_filepath', 'fopen', 'check_file_exist', 'mkdir_or_exist', + 'symlink', 'scandir', 'ProgressBar', 'track_progress', + 'track_iter_progress', 'track_parallel_progress', 'Registry', + 'build_from_cfg', 'Timer', 'TimerError', 'check_time', 'SyncBatchNorm', + '_AdaptiveAvgPoolNd', '_AdaptiveMaxPoolNd', '_AvgPoolNd', '_BatchNorm', + '_ConvNd', '_ConvTransposeMixin', '_InstanceNorm', '_MaxPoolNd', + 'get_build_config', 'BuildExtension', 'CppExtension', 'CUDAExtension', + 'DataLoader', 'PoolDataLoader', 'TORCH_VERSION', + 'deprecated_api_warning', 'digit_version', 'get_git_hash', + 'import_modules_from_strings', 'jit', 'skip_no_elena', + 'assert_dict_contains_subset', 'assert_attrs_equal', + 'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer', + 'assert_params_all_zeros', 'check_python_script', + 'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch', + '_get_cuda_home', 'has_method' + ] diff --git a/lavis/common/annotator/uniformer/mmcv/utils/config.py b/lavis/common/annotator/uniformer/mmcv/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..17149353aefac6d737c67bb2f35a3a6cd2147b0a --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/utils/config.py @@ -0,0 +1,688 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def add_args(parser, cfg, prefix=''):
    """Register one command-line option per entry of ``cfg`` on ``parser``.

    Nested dicts are flattened into dotted option names (``--a.b``) by
    recursing with an extended ``prefix``.

    Args:
        parser (ArgumentParser): Parser that receives the new options.
        cfg (dict): Mapping from option name to its current value; only
            ``cfg.items()`` is used. The type of each value selects the
            kind of option that is registered.
        prefix (str): Text inserted between ``--`` and each key.
            Default: ''.

    Returns:
        ArgumentParser: The same ``parser`` object that was passed in.
    """
    for k, v in cfg.items():
        option = '--' + prefix + k
        if isinstance(v, str):
            parser.add_argument(option)
        elif isinstance(v, bool):
            # Must be tested before ``int``: ``bool`` is a subclass of
            # ``int``, so a later bool branch could never be reached and
            # flags would be registered as integer options.
            parser.add_argument(option, action='store_true')
        elif isinstance(v, int):
            parser.add_argument(option, type=int)
        elif isinstance(v, float):
            parser.add_argument(option, type=float)
        elif isinstance(v, dict):
            add_args(parser, v, prefix + k + '.')
        elif isinstance(v, abc.Iterable):
            # The element type is inferred from the first item. Using the
            # iterator protocol instead of ``v[0]`` also covers iterables
            # that are empty or not indexable (e.g. sets).
            first = next(iter(v), None)
            if first is None:
                # No usable element type: keep the values as strings.
                parser.add_argument(option, nargs='+')
            else:
                parser.add_argument(option, type=type(first), nargs='+')
        else:
            print(f'cannot parse key {prefix + k} of type {type(v)}')
    return parser
+ + Example: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/kchen/projects/mmcv/tests/data/config/a.py" + >>> cfg.item4 + 'test' + >>> cfg + "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" + """ + + @staticmethod + def _validate_py_syntax(filename): + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + content = f.read() + try: + ast.parse(content) + except SyntaxError as e: + raise SyntaxError('There are syntax errors in config ' + f'file {filename}: {e}') + + @staticmethod + def _substitute_predefined_vars(filename, temp_config_name): + file_dirname = osp.dirname(filename) + file_basename = osp.basename(filename) + file_basename_no_extension = osp.splitext(file_basename)[0] + file_extname = osp.splitext(filename)[1] + support_templates = dict( + fileDirname=file_dirname, + fileBasename=file_basename, + fileBasenameNoExtension=file_basename_no_extension, + fileExtname=file_extname) + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + for key, value in support_templates.items(): + regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' + value = value.replace('\\', '/') + config_file = re.sub(regexp, value, config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + + @staticmethod + def _pre_substitute_base_vars(filename, temp_config_name): + """Substitute base variable placehoders to string, so that parsing + would work.""" + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + base_var_dict = {} + regexp = r'\{\{\s*' + 
BASE_KEY + r'\.([\w\.]+)\s*\}\}' + base_vars = set(re.findall(regexp, config_file)) + for base_var in base_vars: + randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' + base_var_dict[randstr] = base_var + regexp = r'\{\{\s*' + BASE_KEY + r'\.' + base_var + r'\s*\}\}' + config_file = re.sub(regexp, f'"{randstr}"', config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + return base_var_dict + + @staticmethod + def _substitute_base_vars(cfg, base_var_dict, base_cfg): + """Substitute variable strings to their actual values.""" + cfg = copy.deepcopy(cfg) + + if isinstance(cfg, dict): + for k, v in cfg.items(): + if isinstance(v, str) and v in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[v].split('.'): + new_v = new_v[new_k] + cfg[k] = new_v + elif isinstance(v, (list, tuple, dict)): + cfg[k] = Config._substitute_base_vars( + v, base_var_dict, base_cfg) + elif isinstance(cfg, tuple): + cfg = tuple( + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg) + elif isinstance(cfg, list): + cfg = [ + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg + ] + elif isinstance(cfg, str) and cfg in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[cfg].split('.'): + new_v = new_v[new_k] + cfg = new_v + + return cfg + + @staticmethod + def _file2dict(filename, use_predefined_variables=True): + filename = osp.abspath(osp.expanduser(filename)) + check_file_exist(filename) + fileExtname = osp.splitext(filename)[1] + if fileExtname not in ['.py', '.json', '.yaml', '.yml']: + raise IOError('Only py/yml/yaml/json type are supported now!') + + with tempfile.TemporaryDirectory() as temp_config_dir: + temp_config_file = tempfile.NamedTemporaryFile( + dir=temp_config_dir, suffix=fileExtname) + if platform.system() == 'Windows': + temp_config_file.close() + temp_config_name = osp.basename(temp_config_file.name) + # Substitute predefined variables + if 
use_predefined_variables: + Config._substitute_predefined_vars(filename, + temp_config_file.name) + else: + shutil.copyfile(filename, temp_config_file.name) + # Substitute base variables from placeholders to strings + base_var_dict = Config._pre_substitute_base_vars( + temp_config_file.name, temp_config_file.name) + + if filename.endswith('.py'): + temp_module_name = osp.splitext(temp_config_name)[0] + sys.path.insert(0, temp_config_dir) + Config._validate_py_syntax(filename) + mod = import_module(temp_module_name) + sys.path.pop(0) + cfg_dict = { + name: value + for name, value in mod.__dict__.items() + if not name.startswith('__') + } + # delete imported module + del sys.modules[temp_module_name] + elif filename.endswith(('.yml', '.yaml', '.json')): + import annotator.uniformer.mmcv as mmcv + cfg_dict = mmcv.load(temp_config_file.name) + # close temp file + temp_config_file.close() + + # check deprecation information + if DEPRECATION_KEY in cfg_dict: + deprecation_info = cfg_dict.pop(DEPRECATION_KEY) + warning_msg = f'The config file {filename} will be deprecated ' \ + 'in the future.' + if 'expected' in deprecation_info: + warning_msg += f' Please use {deprecation_info["expected"]} ' \ + 'instead.' 
@staticmethod
def _merge_a_into_b(a, b, allow_list_keys=False):
    """Return a copy of ``b`` updated with the entries of ``a``.

    Values in ``a`` take precedence. ``b`` is shallow-copied first, so the
    object passed as ``b`` is not modified at the top level.

    Args:
        a (dict): The source dict to be merged into ``b``.
        b (dict | list): The base whose entries are overridden by ``a``.
        allow_list_keys (bool): If True, digit-only string keys
            (e.g. '0', '1') in ``a`` replace the element at that index
            when ``b`` is a list. Default: False.

    Returns:
        dict | list: The merged copy of ``b``.

    Raises:
        KeyError: If a list index taken from ``a`` is beyond the end of
            ``b``.
        TypeError: If a dict value in ``a`` would have to be merged into
            a base value of a non-mergeable type.

    Examples:
        # Normally merge a into b.
        >>> Config._merge_a_into_b(
        ...     dict(obj=dict(a=2)), dict(obj=dict(a=1)))
        {'obj': {'a': 2}}

        # Delete b first and merge a into b.
        >>> Config._merge_a_into_b(
        ...     dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1)))
        {'obj': {'a': 2}}

        # b is a list
        >>> Config._merge_a_into_b(
        ...     {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True)
        [{'a': 2}, {'b': 2}]
    """
    merged = b.copy()
    for key, value in a.items():
        # Digit keys address list positions when list merging is enabled.
        if allow_list_keys and key.isdigit() and isinstance(merged, list):
            index = int(key)
            if len(merged) <= index:
                raise KeyError(
                    f'Index {index} exceeds the length of list {merged}')
            merged[index] = Config._merge_a_into_b(
                value, merged[index], allow_list_keys)
            continue

        # A dict value is merged into an existing base entry unless it
        # carries a truthy delete marker. The marker is popped from the
        # value here, so it never ends up in the merged result.
        merge_into_base = (
            isinstance(value, dict) and key in merged
            and not value.pop(DELETE_KEY, False))
        if not merge_into_base:
            merged[key] = value
            continue

        mergeable = (dict, list) if allow_list_keys else dict
        if not isinstance(merged[key], mergeable):
            raise TypeError(
                f'{key}={value} in child config cannot inherit from base '
                f'because {key} is a dict in the child config but is of '
                f'type {type(merged[key])} in base config. You may set '
                f'`{DELETE_KEY}=True` to ignore the base config')
        merged[key] = Config._merge_a_into_b(
            value, merged[key], allow_list_keys)
    return merged
def __init__(self, cfg_dict=None, cfg_text=None, filename=None):
    """Build a config from a dict and optionally its source text.

    Args:
        cfg_dict (dict, optional): The configuration values. ``None`` is
            treated as an empty dict.
        cfg_text (str, optional): Text stored as the config's ``text``.
            When empty or ``None`` and ``filename`` is given, the text is
            read from that file instead.
        filename (str, optional): Path of the file the config came from.

    Raises:
        TypeError: If ``cfg_dict`` is neither ``None`` nor a dict.
        KeyError: If ``cfg_dict`` uses one of the reserved key names.
    """
    if cfg_dict is None:
        cfg_dict = dict()
    elif not isinstance(cfg_dict, dict):
        raise TypeError('cfg_dict must be a dict, but '
                        f'got {type(cfg_dict)}')
    for key in cfg_dict:
        if key in RESERVED_KEYS:
            raise KeyError(f'{key} is reserved for config file')

    # The class's own ``__setattr__`` forwards to ``_cfg_dict``, so the
    # private fields are stored through the base-class implementation.
    super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict))
    super(Config, self).__setattr__('_filename', filename)
    if cfg_text:
        text = cfg_text
    elif filename:
        # Setting encoding explicitly to resolve coding issue on windows;
        # the other config-file reads in this class do the same.
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        text = ''
    super(Config, self).__setattr__('_text', text)
s.pop(0) + s = [(num_spaces * ' ') + line for line in s] + s = '\n'.join(s) + s = first + '\n' + s + return s + + def _format_basic_types(k, v, use_mapping=False): + if isinstance(v, str): + v_str = f"'{v}'" + else: + v_str = str(v) + + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + + return attr_str + + def _format_list(k, v, use_mapping=False): + # check if all items in the list are dict + if all(isinstance(_, dict) for _ in v): + v_str = '[\n' + v_str += '\n'.join( + f'dict({_indent(_format_dict(v_), indent)}),' + for v_ in v).rstrip(',') + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + ']' + else: + attr_str = _format_basic_types(k, v, use_mapping) + return attr_str + + def _contain_invalid_identifier(dict_str): + contain_invalid_identifier = False + for key_name in dict_str: + contain_invalid_identifier |= \ + (not str(key_name).isidentifier()) + return contain_invalid_identifier + + def _format_dict(input_dict, outest_level=False): + r = '' + s = [] + + use_mapping = _contain_invalid_identifier(input_dict) + if use_mapping: + r += '{' + for idx, (k, v) in enumerate(input_dict.items()): + is_last = idx >= len(input_dict) - 1 + end = '' if outest_level or is_last else ',' + if isinstance(v, dict): + v_str = '\n' + _format_dict(v) + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: dict({v_str}' + else: + attr_str = f'{str(k)}=dict({v_str}' + attr_str = _indent(attr_str, indent) + ')' + end + elif isinstance(v, list): + attr_str = _format_list(k, v, use_mapping) + end + else: + attr_str = _format_basic_types(k, v, use_mapping) + end + + s.append(attr_str) + r += '\n'.join(s) + if use_mapping: + r += '}' + return r + + cfg_dict = 
def dump(self, file=None):
    """Serialize the config to a string or write it to ``file``.

    The output format follows the config's own ``filename``: Python text
    (``pretty_text``) when the name ends with '.py', otherwise whatever
    ``mmcv.dump`` produces for the file extension. A config that has no
    filename (e.g. one built directly from a dict) is dumped as Python
    text.

    Args:
        file (str, optional): Path to write to. If ``None``, the
            serialized config is returned instead.

    Returns:
        str | None: The serialized config when ``file`` is ``None``,
        otherwise ``None``.
    """
    cfg_dict = self._cfg_dict.to_dict()
    filename = self.filename
    # ``filename`` is None for configs not loaded from a file; calling
    # ``.endswith`` on it would raise AttributeError, so fall back to the
    # Python representation in that case.
    if filename is None or filename.endswith('.py'):
        if file is None:
            return self.pretty_text
        with open(file, 'w', encoding='utf-8') as f:
            f.write(self.pretty_text)
        return None

    import annotator.uniformer.mmcv as mmcv
    if file is None:
        file_format = filename.split('.')[-1]
        return mmcv.dump(cfg_dict, file_format=file_format)
    mmcv.dump(cfg_dict, file)
    return None
class DictAction(Action):
    """Argparse action that collects ``KEY=VALUE`` arguments into a dict.

    Each argument is split on its first ``=``; the value part is parsed
    into an int, float, bool, str, list or tuple and stored under the key.
    List options can be passed as comma separated values, i.e.
    'KEY=V1,V2,V3', or with explicit brackets, i.e. 'KEY=[V1,V2,V3]'.
    Nested brackets build nested list/tuple values, e.g.
    'KEY=[(V1,V2),(V3,V4)]'.
    """

    @staticmethod
    def _parse_int_float_bool(val):
        """Convert a scalar string to int, float or bool when possible.

        Args:
            val (str): Text of a single value.

        Returns:
            int | float | bool | str: The first conversion that succeeds,
            tried in the order int, float, bool ('true'/'false', case
            insensitive); ``val`` unchanged if none applies.
        """
        try:
            return int(val)
        except ValueError:
            pass
        try:
            return float(val)
        except ValueError:
            pass
        if val.lower() in ['true', 'false']:
            return val.lower() == 'true'
        return val

    @staticmethod
    def _parse_iterable(val):
        """Parse iterable values in the string.

        All elements inside '()' or '[]' are treated as iterable values.
        A value whose outer brackets do not enclose the whole string, such
        as '(1,2),(3,4)', is read as a comma separated list of its parts.

        Args:
            val (str): Value string.

        Returns:
            list | tuple | int | float | bool | str: The expanded list or
            tuple, or a scalar if the string holds a single value.

        Raises:
            AssertionError: If the numbers of opening and closing
                brackets in the string differ.

        Examples:
            >>> DictAction._parse_iterable('1,2,3')
            [1, 2, 3]
            >>> DictAction._parse_iterable('[a, b, c]')
            ['a', 'b', 'c']
            >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]')
            [(1, 2, 3), ['a', 'b'], 'c']
            >>> DictAction._parse_iterable('(1,2),(3,4)')
            [(1, 2), (3, 4)]
        """

        def find_next_comma(string):
            """Find the position of next comma in the string.

            If no ',' is found in the string, return the string length. All
            chars inside '()' and '[]' are treated as one element and thus
            ',' inside these brackets are ignored.
            """
            if (string.count('(') != string.count(')')
                    or string.count('[') != string.count(']')):
                # Raised explicitly rather than with ``assert`` so the
                # check is not removed when Python runs with ``-O``.
                raise AssertionError(
                    f'Imbalanced brackets exist in {string}')
            # Running balances replace re-counting the prefix for every
            # character; a comma is top-level when both balances are zero.
            paren_balance = 0
            bracket_balance = 0
            for idx, char in enumerate(string):
                if char == ',':
                    if paren_balance == 0 and bracket_balance == 0:
                        return idx
                elif char == '(':
                    paren_balance += 1
                elif char == ')':
                    paren_balance -= 1
                elif char == '[':
                    bracket_balance += 1
                elif char == ']':
                    bracket_balance -= 1
            return len(string)

        received = val
        # Strip ' and " characters and replace whitespace.
        val = val.strip('\'\"').replace(' ', '')
        is_tuple = False
        in_parens = val.startswith('(') and val.endswith(')')
        in_brackets = val.startswith('[') and val.endswith(']')
        if in_parens or in_brackets:
            # Only remove the outer pair when it wraps the whole value.
            # A top-level comma means the first and last brackets belong
            # to different elements, e.g. '(1,2),(3,4)'; removing them
            # would leave a string that can never be split.
            if find_next_comma(val) == len(val):
                is_tuple = in_parens
                val = val[1:-1]
        elif ',' not in val:
            # val is a single value
            return DictAction._parse_int_float_bool(val)

        values = []
        while len(val) > 0:
            comma_idx = find_next_comma(val)
            chunk = val[:comma_idx]
            if chunk == received:
                # Recursing would repeat this exact call forever (the
                # string has commas, but none at top level and no outer
                # brackets to remove), so keep the text as a scalar.
                element = DictAction._parse_int_float_bool(chunk)
            else:
                element = DictAction._parse_iterable(chunk)
            values.append(element)
            val = val[comma_idx + 1:]
        if is_tuple:
            values = tuple(values)
        return values

    def __call__(self, parser, namespace, values, option_string=None):
        """Parse every ``KEY=VALUE`` item and store the dict on namespace.

        Raises:
            ValueError: If an item does not contain ``=``.
        """
        options = {}
        for kv in values:
            key, separator, val = kv.partition('=')
            if not separator:
                raise ValueError(
                    f'Expected an argument of the form KEY=VALUE, '
                    f'but got {kv!r}')
            options[key] = self._parse_iterable(val)
        setattr(namespace, self.dest, options)
+ - PyTorch compiling details: The output of \ + ``torch.__config__.show()``. + - TorchVision (optional): TorchVision version. + - OpenCV: OpenCV version. + - MMCV: MMCV version. + - MMCV Compiler: The GCC version for compiling MMCV ops. + - MMCV CUDA Compiler: The CUDA version for compiling MMCV ops. + """ + env_info = {} + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = torch.cuda.is_available() + env_info['CUDA available'] = cuda_available + + if cuda_available: + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, device_ids in devices.items(): + env_info['GPU ' + ','.join(device_ids)] = name + + from annotator.uniformer.mmcv.utils.parrots_wrapper import _get_cuda_home + CUDA_HOME = _get_cuda_home() + env_info['CUDA_HOME'] = CUDA_HOME + + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output( + f'"{nvcc}" -V | tail -n1', shell=True) + nvcc = nvcc.decode('utf-8').strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + + try: + gcc = subprocess.check_output('gcc --version | head -n1', shell=True) + gcc = gcc.decode('utf-8').strip() + env_info['GCC'] = gcc + except subprocess.CalledProcessError: # gcc is unavailable + env_info['GCC'] = 'n/a' + + env_info['PyTorch'] = torch.__version__ + env_info['PyTorch compiling details'] = get_build_config() + + try: + import torchvision + env_info['TorchVision'] = torchvision.__version__ + except ModuleNotFoundError: + pass + + env_info['OpenCV'] = cv2.__version__ + + env_info['MMCV'] = mmcv.__version__ + + try: + from annotator.uniformer.mmcv.ops import get_compiler_version, get_compiling_cuda_version + except ModuleNotFoundError: + env_info['MMCV Compiler'] = 'n/a' + env_info['MMCV CUDA Compiler'] = 'n/a' + else: + env_info['MMCV Compiler'] = 
get_compiler_version() + env_info['MMCV CUDA Compiler'] = get_compiling_cuda_version() + + return env_info diff --git a/lavis/common/annotator/uniformer/mmcv/utils/ext_loader.py b/lavis/common/annotator/uniformer/mmcv/utils/ext_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..08132d2c1b9a1c28880e4bab4d4fa1ba39d9d083 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/utils/ext_loader.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import importlib +import os +import pkgutil +import warnings +from collections import namedtuple + +import torch + +if torch.__version__ != 'parrots': + + def load_ext(name, funcs): + ext = importlib.import_module('mmcv.' + name) + for fun in funcs: + assert hasattr(ext, fun), f'{fun} miss in module {name}' + return ext +else: + from parrots import extension + from parrots.base import ParrotsException + + has_return_value_ops = [ + 'nms', + 'softnms', + 'nms_match', + 'nms_rotated', + 'top_pool_forward', + 'top_pool_backward', + 'bottom_pool_forward', + 'bottom_pool_backward', + 'left_pool_forward', + 'left_pool_backward', + 'right_pool_forward', + 'right_pool_backward', + 'fused_bias_leakyrelu', + 'upfirdn2d', + 'ms_deform_attn_forward', + 'pixel_group', + 'contour_expand', + ] + + def get_fake_func(name, e): + + def fake_func(*args, **kwargs): + warnings.warn(f'{name} is not supported in parrots now') + raise e + + return fake_func + + def load_ext(name, funcs): + ExtModule = namedtuple('ExtModule', funcs) + ext_list = [] + lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + for fun in funcs: + try: + ext_fun = extension.load(fun, name, lib_dir=lib_root) + except ParrotsException as e: + if 'No element registered' not in e.message: + warnings.warn(e.message) + ext_fun = get_fake_func(fun, e) + ext_list.append(ext_fun) + else: + if fun in has_return_value_ops: + ext_list.append(ext_fun.op) + else: + ext_list.append(ext_fun.op_) + return ExtModule(*ext_list) + 
def check_ops_exist():
    """Check whether the ``mmcv._ext`` extension module can be located.

    Returns:
        bool: True if the import system finds a loader for ``mmcv._ext``,
        False otherwise.

    Raises:
        ImportError: If the lookup itself fails, for example because the
            parent ``mmcv`` package cannot be imported.
    """
    # Imported locally so the function does not depend on the
    # ``importlib.util`` submodule having been loaded elsewhere.
    from importlib.util import find_spec

    # ``pkgutil.find_loader`` is deprecated since Python 3.12. This is the
    # documented replacement, with lookup failures still reported as
    # ImportError as ``find_loader`` did.
    try:
        spec = find_spec('mmcv._ext')
    except (ImportError, AttributeError, TypeError, ValueError) as ex:
        raise ImportError(
            f"Error while finding loader for 'mmcv._ext' "
            f'({type(ex)}: {ex})') from ex
    return spec is not None and spec.loader is not None
As logger.propagate is True by default, this root + # level handler causes logging messages from rank>0 processes to + # unexpectedly show up on the console, creating much unwanted clutter. + # To fix this issue, we set the root logger's StreamHandler, if any, to log + # at the ERROR level. + for handler in logger.root.handlers: + if type(handler) is logging.StreamHandler: + handler.setLevel(logging.ERROR) + + stream_handler = logging.StreamHandler() + handlers = [stream_handler] + + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + else: + rank = 0 + + # only rank 0 will add a FileHandler + if rank == 0 and log_file is not None: + # Here, the default behaviour of the official logger is 'a'. Thus, we + # provide an interface to change the file mode to the default + # behaviour. + file_handler = logging.FileHandler(log_file, file_mode) + handlers.append(file_handler) + + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + for handler in handlers: + handler.setFormatter(formatter) + handler.setLevel(log_level) + logger.addHandler(handler) + + if rank == 0: + logger.setLevel(log_level) + else: + logger.setLevel(logging.ERROR) + + logger_initialized[name] = True + + return logger + + +def print_log(msg, logger=None, level=logging.INFO): + """Print a log message. + + Args: + msg (str): The message to be logged. + logger (logging.Logger | str | None): The logger to be used. + Some special loggers are: + - "silent": no message will be printed. + - other str: the logger obtained with `get_root_logger(logger)`. + - None: The `print()` method will be used to print log messages. + level (int): Logging level. Only available when `logger` is a Logger + object or "root". 
def import_modules_from_strings(imports, allow_failed_imports=False):
    """Import modules from the given list of strings.

    Args:
        imports (list | str | None): The given module names to be imported.
        allow_failed_imports (bool): If True, a module that fails to
            import yields ``None`` (with a ``UserWarning``) instead of an
            error. Default: False.

    Returns:
        list[module] | module | None: The imported modules. A single
        module is returned when ``imports`` is a str, and ``None`` when
        ``imports`` is empty.

    Raises:
        TypeError: If ``imports`` is not a list or str, or one of its
            items is not a str.
        ImportError: If a module cannot be imported and
            ``allow_failed_imports`` is False.

    Examples:
        >>> osp, sys = import_modules_from_strings(
        ...     ['os.path', 'sys'])
        >>> import os.path as osp_
        >>> import sys as sys_
        >>> assert osp == osp_
        >>> assert sys == sys_
    """
    if not imports:
        return
    single_import = False
    if isinstance(imports, str):
        single_import = True
        imports = [imports]
    if not isinstance(imports, list):
        raise TypeError(
            f'custom_imports must be a list but got type {type(imports)}')
    imported = []
    for imp in imports:
        if not isinstance(imp, str):
            raise TypeError(
                f'{imp} is of type {type(imp)} and cannot be imported.')
        try:
            imported_tmp = import_module(imp)
        except ImportError as err:
            if not allow_failed_imports:
                # Name the module and keep the original error as the
                # cause; a bare ImportError would hide both.
                raise ImportError(f'Failed to import {imp}') from err
            warnings.warn(f'{imp} failed to import and is ignored.',
                          UserWarning)
            imported_tmp = None
        imported.append(imported_tmp)
    if single_import:
        imported = imported[0]
    return imported
def slice_list(in_list, lens):
    """Slice a list into several sub lists by a list of given length.

    Args:
        in_list (list): The list to be sliced.
        lens (int or list): The expected length of each out list. An int
            means "equal chunks of this size" and must divide
            ``len(in_list)`` exactly.

    Returns:
        list: A list of sliced list.

    Raises:
        AssertionError: If ``lens`` is an int that does not divide
            ``len(in_list)``.
        TypeError: If ``lens`` is neither an int nor a list.
        ValueError: If the lengths do not sum up to ``len(in_list)``.
    """
    if isinstance(lens, int):
        assert len(in_list) % lens == 0
        # Integer division: no float round-trip for the chunk count.
        lens = [lens] * (len(in_list) // lens)
    if not isinstance(lens, list):
        raise TypeError('"lens" must be an integer or a list of integers')
    elif sum(lens) != len(in_list):
        raise ValueError('sum of lens and list length does not '
                         f'match: {sum(lens)} != {len(in_list)}')
    out_list = []
    idx = 0
    for length in lens:
        out_list.append(in_list[idx:idx + length])
        idx += length
    return out_list
+ """ + return list(itertools.chain(*in_list)) + + +def check_prerequisites( + prerequisites, + checker, + msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' + 'found, please install them first.'): # yapf: disable + """A decorator factory to check if prerequisites are satisfied. + + Args: + prerequisites (str of list[str]): Prerequisites to be checked. + checker (callable): The checker method that returns True if a + prerequisite is meet, False otherwise. + msg_tmpl (str): The message template with two variables. + + Returns: + decorator: A specific decorator. + """ + + def wrap(func): + + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + requirements = [prerequisites] if isinstance( + prerequisites, str) else prerequisites + missing = [] + for item in requirements: + if not checker(item): + missing.append(item) + if missing: + print(msg_tmpl.format(', '.join(missing), func.__name__)) + raise RuntimeError('Prerequisites not meet.') + else: + return func(*args, **kwargs) + + return wrapped_func + + return wrap + + +def _check_py_package(package): + try: + import_module(package) + except ImportError: + return False + else: + return True + + +def _check_executable(cmd): + if subprocess.call(f'which {cmd}', shell=True) != 0: + return False + else: + return True + + +def requires_package(prerequisites): + """A decorator to check if some python packages are installed. + + Example: + >>> @requires_package('numpy') + >>> func(arg1, args): + >>> return numpy.zeros(1) + array([0.]) + >>> @requires_package(['numpy', 'non_package']) + >>> func(arg1, args): + >>> return numpy.zeros(1) + ImportError + """ + return check_prerequisites(prerequisites, checker=_check_py_package) + + +def requires_executable(prerequisites): + """A decorator to check if some executable files are installed. 
def deprecated_api_warning(name_dict, cls_name=None):
    """A decorator to check if some arguments are deprecate and try to replace
    deprecate src_arg_name to dst_arg_name.

    Deprecated keyword arguments are renamed to their replacement before the
    decorated function is called. Deprecated names that are filled
    positionally only trigger a warning, since positional binding does not
    depend on the name.

    Args:
        name_dict(dict):
            key (str): Deprecate argument names.
            val (str): Expected argument names.
        cls_name (str, optional): Class name used as a prefix of the
            function name in the warning text. Default: None.

    Returns:
        func: New function.
    """

    def api_warning_wrapper(old_func):

        @functools.wraps(old_func)
        def new_func(*args, **kwargs):
            # get the arg spec of the decorated method
            args_info = getfullargspec(old_func)
            # get name of the function
            func_name = old_func.__name__
            if cls_name is not None:
                func_name = f'{cls_name}.{func_name}'
            if args:
                # Names of the parameters that were filled positionally.
                arg_names = args_info.args[:len(args)]
                for src_arg_name, dst_arg_name in name_dict.items():
                    if src_arg_name in arg_names:
                        warnings.warn(
                            f'"{src_arg_name}" is deprecated in '
                            f'`{func_name}`, please use "{dst_arg_name}" '
                            'instead')
            if kwargs:
                for src_arg_name, dst_arg_name in name_dict.items():
                    if src_arg_name in kwargs:

                        assert dst_arg_name not in kwargs, (
                            f'The expected behavior is to replace '
                            f'the deprecated key `{src_arg_name}` to '
                            f'new key `{dst_arg_name}`, but got them '
                            f'in the arguments at the same time, which '
                            f'is confusing. `{src_arg_name}` will be '
                            f'deprecated in the future, please '
                            f'use `{dst_arg_name}` instead.')

                        warnings.warn(
                            f'"{src_arg_name}" is deprecated in '
                            f'`{func_name}`, please use "{dst_arg_name}" '
                            'instead')
                        # Forward the value under the new keyword name.
                        kwargs[dst_arg_name] = kwargs.pop(src_arg_name)

            # apply converted arguments to the decorated method
            output = old_func(*args, **kwargs)
            return output

        return new_func

    return api_warning_wrapper
def jit(func=None,
        check_input=None,
        full_shape=True,
        derivate=False,
        coderize=False,
        optimize=False):
    """No-op stand-in for ``parrots.jit.pat`` used when parrots JIT is off.

    Usable both as ``@jit`` (the function is returned untouched) and as
    ``@jit(...)`` (a pass-through wrapper is returned). The option
    arguments are accepted for signature compatibility and ignored here.

    Args:
        func (callable, optional): Function being decorated when used as a
            bare decorator. Default: None.
        check_input, full_shape, derivate, coderize, optimize: Ignored by
            this fallback.

    Returns:
        callable: ``func`` itself, or a decorator when ``func`` is None.
    """
    # Local import: the module's import header does not bring in functools.
    import functools

    def wrapper(func):

        # Preserve __name__/__doc__ of the decorated function.
        @functools.wraps(func)
        def wrapper_inner(*args, **kargs):
            return func(*args, **kargs)

        return wrapper_inner

    if func is None:
        return wrapper
    else:
        return func
def is_rocm_pytorch() -> bool:
    """Tell whether the running PyTorch is a ROCm (HIP) build.

    Returns:
        bool: True only when the backend is not parrots, ``ROCM_HOME`` can
        be imported from ``torch.utils.cpp_extension`` and both
        ``torch.version.hip`` and ``ROCM_HOME`` are not None.
    """
    if TORCH_VERSION == 'parrots':
        return False
    try:
        from torch.utils.cpp_extension import ROCM_HOME
        has_hip = torch.version.hip is not None
        has_rocm_home = ROCM_HOME is not None
        detected = has_hip and has_rocm_home
    except ImportError:
        # ROCM_HOME is not importable: treat as "not ROCm".
        detected = False
    return detected
class SyncBatchNorm(SyncBatchNorm_):
    """SyncBatchNorm with a relaxed input-dimension check under parrots."""

    def _check_input_dim(self, input):
        """Validate the dimensionality of ``input``.

        With parrots only "at least 2D" is required; otherwise the check of
        the parent class is used.

        Raises:
            ValueError: Under parrots, when ``input`` has fewer than 2 dims.
        """
        if TORCH_VERSION != 'parrots':
            super()._check_input_dim(input)
            return
        if input.dim() < 2:
            raise ValueError(
                f'expected at least 2D input (got {input.dim()}D input)')
def scandir(dir_path, suffix=None, recursive=False, case_sensitive=True):
    """Lazily list files below a directory, optionally filtered by suffix.

    Files whose name starts with ``.`` are never yielded.

    Args:
        dir_path (str | obj:`Path`): Directory to scan.
        suffix (str | tuple(str), optional): Only yield files whose relative
            path ends with this suffix (or one of them). Default: None.
        recursive (bool, optional): Descend into sub-directories when True.
            Default: False.
        case_sensitive (bool, optional): When False, the suffix comparison
            ignores case. Default: True.

    Returns:
        A generator of file paths relative to ``dir_path``.

    Raises:
        TypeError: If ``dir_path`` is not a str/Path, or ``suffix`` is not
            None, a str or a tuple.
    """
    if not isinstance(dir_path, (str, Path)):
        raise TypeError('"dir_path" must be a string or Path object')
    dir_path = str(dir_path)

    if suffix is not None:
        if not isinstance(suffix, (str, tuple)):
            raise TypeError('"suffix" must be a string or tuple of strings')
        if not case_sensitive:
            if isinstance(suffix, str):
                suffix = suffix.lower()
            else:
                suffix = tuple(item.lower() for item in suffix)

    top = dir_path

    def _walk(current):
        for entry in os.scandir(current):
            if not entry.name.startswith('.') and entry.is_file():
                relative = osp.relpath(entry.path, top)
                probe = relative if case_sensitive else relative.lower()
                if suffix is None or probe.endswith(suffix):
                    yield relative
            elif recursive and os.path.isdir(entry.path):
                # descend into the sub-directory
                yield from _walk(entry.path)

    return _walk(dir_path)
class ProgressBar:
    """Text progress indicator written to a file-like object.

    With a positive ``task_num`` a bar with ETA is drawn; otherwise only a
    running count of completed tasks is printed.
    """

    def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout):
        self.task_num = task_num
        self.bar_width = bar_width
        self.completed = 0
        self.file = file
        if start:
            self.start()

    @property
    def terminal_width(self):
        """Current terminal width in columns."""
        columns, _ = get_terminal_size()
        return columns

    def start(self):
        """Print the initial line and start the timer."""
        if self.task_num > 0:
            header = (f'[{" " * self.bar_width}] 0/{self.task_num}, '
                      'elapsed: 0s, ETA:')
        else:
            header = 'completed: 0, elapsed: 0s'
        self.file.write(header)
        self.file.flush()
        self.timer = Timer()

    def update(self, num_tasks=1):
        """Record ``num_tasks`` finished tasks and redraw the status line.

        Args:
            num_tasks (int): Number of newly completed tasks; must be > 0.
        """
        assert num_tasks > 0
        self.completed += num_tasks
        elapsed = self.timer.since_start()
        fps = self.completed / elapsed if elapsed > 0 else float('inf')

        if not self.task_num > 0:
            # Total unknown: only report the running count.
            self.file.write(
                f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,'
                f' {fps:.1f} tasks/s')
            self.file.flush()
            return

        percentage = self.completed / float(self.task_num)
        eta = int(elapsed * (1 - percentage) / percentage + 0.5)
        # "{{}}" leaves a "{}" placeholder that is filled with the bar below.
        msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \
              f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \
              f'ETA: {eta:5}s'

        # Shrink the bar so the whole line fits the terminal (min width 2).
        candidates = (self.bar_width,
                      int(self.terminal_width - len(msg)) + 2,
                      int(self.terminal_width * 0.6))
        bar_width = max(2, min(candidates))
        done = int(bar_width * percentage)
        bar_chars = '>' * done + ' ' * (bar_width - done)
        self.file.write(msg.format(bar_chars))
        self.file.flush()
def init_pool(process_num, initializer=None, initargs=None):
    """Create a :class:`multiprocessing.Pool`.

    Args:
        process_num (int): Number of worker processes.
        initializer (callable, optional): Passed to ``Pool`` when given.
        initargs (tuple, optional): Arguments for ``initializer``; only used
            when ``initializer`` is given.

    Returns:
        multiprocessing.Pool: The created pool.

    Raises:
        TypeError: If ``initializer`` is given and ``initargs`` is neither
            None nor a tuple.
    """
    pool_args = [process_num]
    if initializer is not None:
        pool_args.append(initializer)
        if initargs is not None:
            if not isinstance(initargs, tuple):
                raise TypeError('"initargs" must be a tuple')
            pool_args.append(initargs)
    return Pool(*pool_args)
def track_iter_progress(tasks, bar_width=50, file=sys.stdout):
    """Yield tasks one by one while drawing a progress bar.

    The bar is advanced after the consumer has finished with each item, and
    a newline is written once the iteration is exhausted.

    Args:
        tasks (list or tuple[Iterable, int]): A list of tasks or
            (tasks, total num).
        bar_width (int): Width of progress bar.
        file: File-like object the bar is written to. Default: sys.stdout.

    Yields:
        The items of ``tasks``, in order.

    Raises:
        TypeError: If ``tasks`` is neither an iterable nor a 2-tuple.
    """
    if isinstance(tasks, tuple):
        assert len(tasks) == 2
        assert isinstance(tasks[0], Iterable)
        assert isinstance(tasks[1], int)
        tasks, task_num = tasks
    elif isinstance(tasks, Iterable):
        task_num = len(tasks)
    else:
        raise TypeError(
            '"tasks" must be an iterable object or a (iterator, int) tuple')
    progress = ProgressBar(task_num, bar_width, file=file)
    for item in tasks:
        yield item
        progress.update()
    progress.file.write('\n')
class Registry:
    """A registry to map strings to classes.

    Registered object could be built from registry.
    Example:
        >>> MODELS = Registry('models')
        >>> @MODELS.register_module()
        >>> class ResNet:
        >>>     pass
        >>> resnet = MODELS.build(dict(type='ResNet'))

    Please refer to
    https://mmcv.readthedocs.io/en/latest/understand_mmcv/registry.html for
    advanced usage.

    Args:
        name (str): Registry name.
        build_func(func, optional): Build function to construct instance from
            Registry, func:`build_from_cfg` is used if neither ``parent`` or
            ``build_func`` is specified. If ``parent`` is specified and
            ``build_func`` is not given,  ``build_func`` will be inherited
            from ``parent``. Default: None.
        parent (Registry, optional): Parent registry. The class registered in
            children registry could be built from parent. Default: None.
        scope (str, optional): The scope of registry. It is the key to search
            for children registry. If not specified, scope will be the name of
            the package where class is defined, e.g. mmdet, mmcls, mmseg.
            Default: None.
    """

    def __init__(self, name, build_func=None, parent=None, scope=None):
        self._name = name
        self._module_dict = dict()
        self._children = dict()
        # NOTE: infer_scope() inspects a fixed stack depth, so it must be
        # called directly from __init__ (no intermediate helper).
        self._scope = self.infer_scope() if scope is None else scope

        # self.build_func will be set with the following priority:
        # 1. build_func
        # 2. parent.build_func
        # 3. build_from_cfg
        if build_func is None:
            if parent is not None:
                self.build_func = parent.build_func
            else:
                self.build_func = build_from_cfg
        else:
            self.build_func = build_func
        if parent is not None:
            assert isinstance(parent, Registry)
            parent._add_children(self)
            self.parent = parent
        else:
            self.parent = None

    def __len__(self):
        return len(self._module_dict)

    def __contains__(self, key):
        return self.get(key) is not None

    def __repr__(self):
        format_str = self.__class__.__name__ + \
                     f'(name={self._name}, ' \
                     f'items={self._module_dict})'
        return format_str

    @staticmethod
    def infer_scope():
        """Infer the scope of registry.

        The name of the package where registry is defined will be returned.

        Example:
            # in mmdet/models/backbone/resnet.py
            >>> MODELS = Registry('models')
            >>> @MODELS.register_module()
            >>> class ResNet:
            >>>     pass
            The scope of ``ResNet`` will be ``mmdet``.


        Returns:
            scope (str): The inferred scope name.
        """
        # inspect.stack() trace where this function is called, the index-2
        # indicates the frame where `infer_scope()` is called
        filename = inspect.getmodule(inspect.stack()[2][0]).__name__
        split_filename = filename.split('.')
        return split_filename[0]

    @staticmethod
    def split_scope_key(key):
        """Split scope and key.

        The first scope will be split from key.

        Examples:
            >>> Registry.split_scope_key('mmdet.ResNet')
            'mmdet', 'ResNet'
            >>> Registry.split_scope_key('ResNet')
            None, 'ResNet'

        Return:
            scope (str, None): The first scope.
            key (str): The remaining key.
        """
        split_index = key.find('.')
        if split_index != -1:
            return key[:split_index], key[split_index + 1:]
        else:
            return None, key

    @property
    def name(self):
        return self._name

    @property
    def scope(self):
        return self._scope

    @property
    def module_dict(self):
        return self._module_dict

    @property
    def children(self):
        return self._children

    def get(self, key):
        """Get the registry record.

        Args:
            key (str): The class name in string format, optionally prefixed
                with a scope (``'scope.ClassName'``).

        Returns:
            class: The corresponding class, or None if the key cannot be
            resolved.
        """
        scope, real_key = self.split_scope_key(key)
        if scope is None or scope == self._scope:
            # get from self
            if real_key in self._module_dict:
                return self._module_dict[real_key]
            return None

        # get from self._children
        if scope in self._children:
            return self._children[scope].get(real_key)

        # goto root
        parent = self.parent
        if parent is None:
            # This registry already is the root and the scope is unknown:
            # report "not found" instead of dereferencing None.
            return None
        while parent.parent is not None:
            parent = parent.parent
        return parent.get(key)

    def build(self, *args, **kwargs):
        """Build an object via ``build_func``, passing ``registry=self``."""
        return self.build_func(*args, **kwargs, registry=self)

    def _add_children(self, registry):
        """Add children for a registry.

        The ``registry`` will be added as children based on its scope.
        The parent registry could build objects from children registry.

        Example:
            >>> models = Registry('models')
            >>> mmdet_models = Registry('models', parent=models)
            >>> @mmdet_models.register_module()
            >>> class ResNet:
            >>>     pass
            >>> resnet = models.build(dict(type='mmdet.ResNet'))
        """

        assert isinstance(registry, Registry)
        assert registry.scope is not None
        assert registry.scope not in self.children, \
            f'scope {registry.scope} exists in {self.name} registry'
        self.children[registry.scope] = registry

    def _register_module(self, module_class, module_name=None, force=False):
        """Store ``module_class`` under one or several names.

        Args:
            module_class (type): Class to register.
            module_name (str | Sequence[str] | None): Name(s) to register
                under; the class name is used when None.
            force (bool): Whether to override an existing entry.

        Raises:
            TypeError: If ``module_class`` is not a class.
            KeyError: If a name is already registered and ``force`` is False.
        """
        if not inspect.isclass(module_class):
            raise TypeError('module must be a class, '
                            f'but got {type(module_class)}')

        if module_name is None:
            module_name = module_class.__name__
        if isinstance(module_name, str):
            module_name = [module_name]
        for name in module_name:
            if not force and name in self._module_dict:
                raise KeyError(f'{name} is already registered '
                               f'in {self.name}')
            self._module_dict[name] = module_class

    def deprecated_register_module(self, cls=None, force=False):
        """Old-style ``register_module(cls, force=False)``; emits a warning."""
        warnings.warn(
            'The old API of register_module(module, force=False) '
            'is deprecated and will be removed, please use the new API '
            'register_module(name=None, force=False, module=None) instead.')
        if cls is None:
            return partial(self.deprecated_register_module, force=force)
        self._register_module(cls, force=force)
        return cls

    def register_module(self, name=None, force=False, module=None):
        """Register a module.

        A record will be added to `self._module_dict`, whose key is the class
        name or the specified name, and value is the class itself.
        It can be used as a decorator or a normal function.

        Example:
            >>> backbones = Registry('backbone')
            >>> @backbones.register_module()
            >>> class ResNet:
            >>>     pass

            >>> backbones = Registry('backbone')
            >>> @backbones.register_module(name='mnet')
            >>> class MobileNet:
            >>>     pass

            >>> backbones = Registry('backbone')
            >>> class ResNet:
            >>>     pass
            >>> backbones.register_module(ResNet)

        Args:
            name (str | None): The module name to be registered. If not
                specified, the class name will be used.
            force (bool, optional): Whether to override an existing class with
                the same name. Default: False.
            module (type): Module class to be registered.

        Raises:
            TypeError: If ``force`` is not a bool or ``name`` has an
                unsupported type.
        """
        if not isinstance(force, bool):
            raise TypeError(f'force must be a boolean, but got {type(force)}')
        # NOTE: This is a workaround to be compatible with the old api,
        # while it may introduce unexpected bugs.
        if isinstance(name, type):
            return self.deprecated_register_module(name, force=force)

        # raise the error ahead of time
        if not (name is None or isinstance(name, str) or is_seq_of(name, str)):
            raise TypeError(
                'name must be either of None, an instance of str or a sequence'
                f'  of str, but got {type(name)}')

        # use it as a normal method: x.register_module(module=SomeClass)
        if module is not None:
            self._register_module(
                module_class=module, module_name=name, force=force)
            return module

        # use it as a decorator: @x.register_module()
        def _register(cls):
            self._register_module(
                module_class=cls, module_name=name, force=force)
            return cls

        return _register
def check_python_script(cmd):
    """Run a python script command in the current process as ``__main__``.

    Unlike ``os.system``, the script is executed in-process (through
    ``runpy.run_path``), so it can be tracked by coverage tools. Two command
    forms are supported:

    - ./tests/data/scripts/hello.py zz
    - python tests/data/scripts/hello.py zz

    Args:
        cmd (str): Command line; it is tokenised with ``shlex.split``.
    """
    argv = split(cmd)
    # Drop a leading interpreter name so that argv[0] is the script path.
    if argv[0] == 'python':
        del argv[0]
    # ``sys.argv`` is patched only for the duration of the script run.
    with patch.object(sys, 'argv', argv):
        run_path(argv[0], run_name='__main__')
def assert_dict_has_keys(obj: Dict[str, Any],
                         expected_keys: List[str]) -> bool:
    """Tell whether every expected key is present in ``obj``.

    Args:
        obj (Dict[str, Any]): Mapping to be checked.
        expected_keys (List[str]): Keys that must all appear among the keys
            of ``obj``.

    Returns:
        bool: True when no expected key is missing, False otherwise.
    """
    missing = set(expected_keys).difference(obj.keys())
    return not missing
class Timer:
    """A flexible Timer class.

    :Example:

    >>> import time
    >>> import annotator.uniformer.mmcv as mmcv
    >>> with mmcv.Timer():
    >>>     # simulate a code block that will run for 1s
    >>>     time.sleep(1)
    1.000
    >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'):
    >>>     # simulate a code block that will run for 1s
    >>>     time.sleep(1)
    it takes 1.0 seconds
    >>> timer = mmcv.Timer()
    >>> time.sleep(0.5)
    >>> print(timer.since_start())
    0.500
    >>> time.sleep(0.5)
    >>> print(timer.since_last_check())
    0.500
    >>> print(timer.since_start())
    1.000

    Args:
        start (bool): Whether to start the timer on construction.
        print_tmpl (str, optional): Format template used when the timer is
            used as a context manager. Falls back to ``'{:.3f}'``.
    """

    def __init__(self, start=True, print_tmpl=None):
        self._is_running = False
        self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}'
        if start:
            self.start()

    @property
    def is_running(self):
        """bool: indicate whether the timer is running"""
        return self._is_running

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        # Report the time elapsed since the last check, then stop.
        print(self.print_tmpl.format(self.since_last_check()))
        self._is_running = False

    def start(self):
        """Start the timer.

        If the timer is already running only the "last check" timestamp is
        reset; the start timestamp is kept.
        """
        # Read the clock once so that a fresh timer has identical start and
        # last-check timestamps.
        now = time()
        if not self._is_running:
            self._t_start = now
            self._is_running = True
        self._t_last = now

    def since_start(self):
        """Total time since the timer is started.

        This counts as a checking operation: the "last check" timestamp is
        updated.

        Returns (float): Time in seconds.

        Raises:
            TimerError: If the timer is not running.
        """
        if not self._is_running:
            raise TimerError('timer is not running')
        self._t_last = time()
        return self._t_last - self._t_start

    def since_last_check(self):
        """Time since the last checking.

        Either :func:`since_start` or :func:`since_last_check` is a checking
        operation.

        Returns (float): Time in seconds.

        Raises:
            TimerError: If the timer is not running.
        """
        if not self._is_running:
            raise TimerError('timer is not running')
        # A single clock read is used both for the returned duration and
        # for the new reference point, so no time falls between intervals.
        now = time()
        dur = now - self._t_last
        self._t_last = now
        return dur
def digit_version(version_str: str, length: int = 4):
    """Convert a version string into a tuple of integers.

    This method is usually used for comparing two versions. For pre-release
    versions: alpha < beta < rc.

    The result always has ``length + 2`` entries: the release numbers
    (truncated or zero-padded to ``length``) followed by a marker and a
    number describing the pre-/post-release state.

    Args:
        version_str (str): The version string.
        length (int): The maximum number of version levels. Default: 4.

    Returns:
        tuple[int]: The version info in digits (integers).
    """
    assert 'parrots' not in version_str
    version = parse(version_str)
    assert version.release, f'failed to parse version {version_str}'
    release = list(version.release)[:length]
    # Pad with zeros up to ``length`` (a non-positive count adds nothing).
    release += [0] * (length - len(release))
    if version.is_prerelease:
        mapping = {'a': -3, 'b': -2, 'rc': -1}
        # Default used when there is no pre-release segment
        # (``version.pre`` can be None) or its tag is not recognised.
        val, num = -4, 0
        if version.pre:
            tag, num = version.pre[0], version.pre[-1]
            if tag in mapping:
                val = mapping[tag]
            else:
                warnings.warn(f'unknown prerelease version {tag}, '
                              'version checking may go wrong')
        # Always append the two trailing entries so that every code path
        # yields a tuple of the same length.
        release.extend([val, num])
    elif version.is_postrelease:
        release.extend([1, version.post])
    else:
        release.extend([0, 0])
    return tuple(release)
def parse_version_info(version_str: str, length: int = 4) -> tuple:
    """Parse a version string into a tuple.

    Args:
        version_str (str): The version string.
        length (int): The maximum number of version levels. Default: 4.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
            (1, 3, 0, 0, 0, 0), "2.0.0rc1" is parsed into
            (2, 0, 0, 0, 'rc', 1), "1.3.17.post2" into
            (1, 3, 17, 0, 'post', 2) and "1.0.dev3" into
            (1, 0, 0, 0, 'dev', 3) (when length is set to 4).
    """
    from packaging.version import parse
    version = parse(version_str)
    assert version.release, f'failed to parse version {version_str}'
    release = list(version.release)[:length]
    # Pad with zeros up to ``length`` (a non-positive count adds nothing).
    release += [0] * (length - len(release))
    if version.pre is not None:
        # ``pre`` is a (tag, number) pair such as ('rc', 1).
        release.extend(version.pre)
    elif version.dev is not None:
        # Dev-only releases are pre-releases whose ``pre`` is None, so they
        # cannot be unpacked like the case above.
        release.extend(['dev', version.dev])
    elif version.is_postrelease:
        # ``post`` is a plain int, not a sequence.
        release.extend(['post', version.post])
    else:
        release.extend([0, 0])
    return tuple(release)
class Cache:
    """A small bounded key/value cache with first-in-first-out eviction.

    Entries are kept in insertion order; when the cache is full the entry
    that was inserted first is dropped. Reading an entry does not change
    its position.

    Args:
        capacity (int): Maximum number of entries; converted with ``int``.

    Raises:
        ValueError: If the converted capacity is not positive.
    """

    def __init__(self, capacity):
        self._cache = OrderedDict()
        self._capacity = int(capacity)
        # Validate the truncated value: e.g. 0.5 becomes 0, which would
        # make ``put`` evict from an empty dict.
        if self._capacity <= 0:
            raise ValueError('capacity must be a positive integer')

    @property
    def capacity(self):
        """int: Maximum number of entries the cache can hold."""
        return self._capacity

    @property
    def size(self):
        """int: Number of entries currently stored."""
        return len(self._cache)

    def put(self, key, val):
        """Store ``val`` under ``key`` unless the key is already cached.

        An existing entry is left untouched. If the cache is full, the
        oldest inserted entry is evicted first.
        """
        if key in self._cache:
            return
        if len(self._cache) >= self.capacity:
            self._cache.popitem(last=False)
        self._cache[key] = val

    def get(self, key, default=None):
        """Return the value cached under ``key``, or ``default`` if absent."""
        return self._cache.get(key, default)
def __init__(self, filename, cache_capacity=10):
    """Open ``filename`` and record the basic properties of the video.

    Args:
        filename (str): Path or http(s) URL of the video. Local paths are
            checked with ``check_file_exist``; URLs skip that check.
        cache_capacity (int): Capacity of the frame cache; must be
            positive.
    """
    # Only local paths are checked for existence; URLs go straight to
    # OpenCV.
    is_url = filename.startswith(('https://', 'http://'))
    if not is_url:
        check_file_exist(filename, 'Video file not found: ' + filename)
    self._vcap = cv2.VideoCapture(filename)
    assert cache_capacity > 0
    self._cache = Cache(cache_capacity)
    self._position = 0
    # get basic info
    capture = self._vcap
    self._width = int(capture.get(CAP_PROP_FRAME_WIDTH))
    self._height = int(capture.get(CAP_PROP_FRAME_HEIGHT))
    self._fps = capture.get(CAP_PROP_FPS)
    self._frame_cnt = int(capture.get(CAP_PROP_FRAME_COUNT))
    self._fourcc = capture.get(CAP_PROP_FOURCC)
def read(self):
    """Read the next frame.

    A frame already stored in the cache for the current position is
    returned directly. Otherwise the frame is decoded from the capture
    (re-seeking first if the capture's position differs from the cursor),
    cached, and returned. The cursor advances by one on success.

    Returns:
        ndarray or None: Return the frame if successful, otherwise None.
    """
    if not self._cache:
        # No cache available: decode straight from the capture.
        ok, frame = self._vcap.read()
    else:
        frame = self._cache.get(self._position)
        if frame is not None:
            ok = True
        else:
            # Bring the capture in sync with the cursor before decoding.
            if self._position != self._get_real_position():
                self._set_real_position(self._position)
            ok, frame = self._vcap.read()
            if ok:
                self._cache.put(self._position, frame)
    if ok:
        self._position += 1
    return frame
def frames2video(frame_dir,
                 video_file,
                 fps=30,
                 fourcc='XVID',
                 filename_tmpl='{:06d}.jpg',
                 start=0,
                 end=0,
                 show_progress=True):
    """Read the frame images from a directory and join them as a video.

    Frames with indices ``start`` .. ``end - 1`` are written. The output
    resolution is taken from the start frame.

    Args:
        frame_dir (str): The directory containing video frames.
        video_file (str): Output filename.
        fps (float): FPS of the output video.
        fourcc (str): Fourcc of the output video, this should be compatible
            with the output file type.
        filename_tmpl (str): Filename template with the index as the variable.
        start (int): Starting frame index.
        end (int): Ending frame index (exclusive). When 0, it is set to the
            number of files in ``frame_dir`` that ``scandir`` reports for
            the template's extension.
        show_progress (bool): Whether to show a progress bar.

    Raises:
        IOError: If the start frame exists but cannot be decoded.
    """
    if end == 0:
        ext = filename_tmpl.split('.')[-1]
        end = sum(1 for _ in scandir(frame_dir, ext))
    first_file = osp.join(frame_dir, filename_tmpl.format(start))
    check_file_exist(first_file, 'The start frame not found: ' + first_file)
    img = cv2.imread(first_file)
    # cv2.imread signals a decoding failure by returning None instead of
    # raising; fail with a clear message rather than on ``img.shape``.
    if img is None:
        raise IOError(f'Failed to decode the start frame: {first_file}')
    height, width = img.shape[:2]
    resolution = (width, height)
    vwriter = cv2.VideoWriter(video_file, VideoWriter_fourcc(*fourcc), fps,
                              resolution)

    def write_frame(file_idx):
        filename = osp.join(frame_dir, filename_tmpl.format(file_idx))
        img = cv2.imread(filename)
        vwriter.write(img)

    # Release the writer even if a frame fails, so the output file handle
    # is not left open.
    try:
        if show_progress:
            track_progress(write_frame, range(start, end))
        else:
            for i in range(start, end):
                write_frame(i)
    finally:
        vwriter.release()
def flowwrite(flow, filename, quantize=False, concat_axis=0, *args, **kwargs):
    """Write optical flow to file.

    Without quantization the flow is stored losslessly as: the 4-byte tag
    ``PIEH``, the width and height as int32, then the flow values as
    float32. With quantization the flow is passed to :func:`quantize_flow`
    and the resulting dx and dy maps are concatenated along ``concat_axis``
    and written as a single image with ``imwrite``.

    Args:
        flow (ndarray): (h, w, 2) array of optical flow.
        filename (str): Output filepath.
        quantize (bool): Whether to quantize the flow before saving. If set
            to True, remaining args will be passed to :func:`quantize_flow`.
        concat_axis (int): The axis that dx and dy are concatenated,
            can be either 0 or 1. Ignored if quantize is False.
    """
    if quantize:
        assert concat_axis in [0, 1]
        dx, dy = quantize_flow(flow, *args, **kwargs)
        merged = np.concatenate((dx, dy), axis=concat_axis)
        imwrite(merged, filename)
        return

    with open(filename, 'wb') as f:
        f.write('PIEH'.encode('utf-8'))
        # Header stores width first, then height.
        size = np.array([flow.shape[1], flow.shape[0]], dtype=np.int32)
        size.tofile(f)
        flow.astype(np.float32).tofile(f)
        f.flush()
+ + Args: + img (ndarray, float or uint8): Image to be warped. + flow (ndarray, float): Optical Flow. + filling_value (int): The missing pixels will be set with filling_value. + interpolate_mode (str): bilinear -> Bilinear Interpolation; + nearest -> Nearest Neighbor. + + Returns: + ndarray: Warped image with the same shape of img + """ + warnings.warn('This function is just for prototyping and cannot ' + 'guarantee the computational efficiency.') + assert flow.ndim == 3, 'Flow must be in 3D arrays.' + height = flow.shape[0] + width = flow.shape[1] + channels = img.shape[2] + + output = np.ones( + (height, width, channels), dtype=img.dtype) * filling_value + + grid = np.indices((height, width)).swapaxes(0, 1).swapaxes(1, 2) + dx = grid[:, :, 0] + flow[:, :, 1] + dy = grid[:, :, 1] + flow[:, :, 0] + sx = np.floor(dx).astype(int) + sy = np.floor(dy).astype(int) + valid = (sx >= 0) & (sx < height - 1) & (sy >= 0) & (sy < width - 1) + + if interpolate_mode == 'nearest': + output[valid, :] = img[dx[valid].round().astype(int), + dy[valid].round().astype(int), :] + elif interpolate_mode == 'bilinear': + # dirty walkround for integer positions + eps_ = 1e-6 + dx, dy = dx + eps_, dy + eps_ + left_top_ = img[np.floor(dx[valid]).astype(int), + np.floor(dy[valid]).astype(int), :] * ( + np.ceil(dx[valid]) - dx[valid])[:, None] * ( + np.ceil(dy[valid]) - dy[valid])[:, None] + left_down_ = img[np.ceil(dx[valid]).astype(int), + np.floor(dy[valid]).astype(int), :] * ( + dx[valid] - np.floor(dx[valid]))[:, None] * ( + np.ceil(dy[valid]) - dy[valid])[:, None] + right_top_ = img[np.floor(dx[valid]).astype(int), + np.ceil(dy[valid]).astype(int), :] * ( + np.ceil(dx[valid]) - dx[valid])[:, None] * ( + dy[valid] - np.floor(dy[valid]))[:, None] + right_down_ = img[np.ceil(dx[valid]).astype(int), + np.ceil(dy[valid]).astype(int), :] * ( + dx[valid] - np.floor(dx[valid]))[:, None] * ( + dy[valid] - np.floor(dy[valid]))[:, None] + output[valid, :] = left_top_ + left_down_ + right_top_ + 
def flow_from_bytes(content):
    """Read dense optical flow from bytes.

    .. note::
        This load optical flow function works for FlyingChairs, FlyingThings3D,
        Sintel, FlyingChairsOcc datasets, but cannot load the data from
        ChairsSDHom.

    Layout expected in ``content``: the 4-byte tag ``PIEH``, then width and
    height as int32, then ``width * height * 2`` float32 flow values.

    Args:
        content (bytes): Optical flow bytes got from files or other streams.

    Returns:
        ndarray: Loaded optical flow with the shape (H, W, 2).

    Raises:
        Exception: If the first 4 bytes are not ``PIEH``.
    """
    # Compare raw bytes: decoding an arbitrary binary header as UTF-8 could
    # raise UnicodeDecodeError and hide the real problem.
    if bytes(content[:4]) != b'PIEH':
        raise Exception('Flow file header does not contain PIEH')
    # Width and height are the two int32 values after the header; reading
    # with ``offset`` avoids copying the remainder of the buffer.
    width, height = (
        int(v) for v in np.frombuffer(content, np.int32, 2, offset=4))
    # After the first 12 bytes, all bytes are flow.
    flow = np.frombuffer(
        content, np.float32, width * height * 2, offset=12).reshape(
            (height, width, 2))

    return flow
@requires_executable('ffmpeg')
def convert_video(in_file,
                  out_file,
                  print_cmd=False,
                  pre_options='',
                  **kwargs):
    """Convert a video with ffmpeg.

    This provides a general api to ffmpeg, the executed command is::

        `ffmpeg -y <pre_options> -i <in_file> <options> <out_file>`

    Options(kwargs) are mapped to ffmpeg commands with the following rules:

    - key=val: "-key val"
    - key=True: "-key"
    - key=False: ""
    - log_level=val: "-loglevel val" (val must be a known ffmpeg level)

    Args:
        in_file (str): Input video filename.
        out_file (str): Output video filename.
        pre_options (str): Options appears before "-i <in_file>".
        print_cmd (bool): Whether to print the final ffmpeg command.
    """
    known_levels = [
        'quiet', 'panic', 'fatal', 'error', 'warning', 'info',
        'verbose', 'debug', 'trace'
    ]
    options = []
    for key, value in kwargs.items():
        if isinstance(value, bool):
            # Boolean switches: emitted bare when True, omitted when False.
            if value:
                options.append(f'-{key}')
            continue
        if key == 'log_level':
            assert value in known_levels
            options.append(f'-loglevel {value}')
            continue
        options.append(f'-{key} {value}')
    option_str = ' '.join(options)
    cmd = f'ffmpeg -y {pre_options} -i {in_file} {option_str} {out_file}'
    if print_cmd:
        print(cmd)
    # NOTE(review): the command is a single string run through the shell,
    # so file names and option values are interpreted by the shell — do not
    # pass untrusted input.
    subprocess.call(cmd, shell=True)
@requires_executable('ffmpeg')
def concat_video(video_list,
                 out_file,
                 vcodec=None,
                 acodec=None,
                 log_level='info',
                 print_cmd=False):
    """Concatenate multiple videos into a single one.

    The absolute paths of the inputs are written to a temporary list file,
    which is passed to ffmpeg with ``-f concat -safe 0``. The list file is
    removed afterwards, even when writing it or running ffmpeg fails.

    Args:
        video_list (list): A list of video filenames
        out_file (str): Output video filename
        vcodec (None or str): Output video codec, None for unchanged
        acodec (None or str): Output audio codec, None for unchanged
        log_level (str): Logging level of ffmpeg.
        print_cmd (bool): Whether to print the final ffmpeg command.
    """
    tmp_filehandler, tmp_filename = tempfile.mkstemp(suffix='.txt', text=True)
    try:
        # Wrap the descriptor returned by mkstemp instead of opening the
        # path a second time; the ``with`` block closes it before ffmpeg
        # reads the list.
        with os.fdopen(tmp_filehandler, 'w') as f:
            for filename in video_list:
                f.write(f'file {osp.abspath(filename)}\n')
        options = {'log_level': log_level}
        if vcodec is None:
            options['vcodec'] = 'copy'
        if acodec is None:
            options['acodec'] = 'copy'
        convert_video(
            tmp_filename,
            out_file,
            print_cmd,
            pre_options='-f concat -safe 0',
            **options)
    finally:
        # Always clean up the temporary list file.
        os.remove(tmp_filename)
def color_val(color):
    """Convert various input to color tuples.

    Args:
        color (:obj:`Color`/str/tuple/int/ndarray): Color inputs. A str is
            looked up by name in :class:`Color`; an int is used for all
            three channels; a tuple or 1-D ndarray must hold 3 values in
            [0, 255].

    Returns:
        tuple[int]: A tuple of 3 integers indicating BGR channels.

    Raises:
        TypeError: If ``color`` is none of the supported types.
        AssertionError: If a tuple/int/ndarray input has the wrong length
            or a channel outside [0, 255].
    """
    if isinstance(color, str):
        return Color[color].value
    elif isinstance(color, tuple):
        assert len(color) == 3
        for channel in color:
            assert 0 <= channel <= 255
        return color
    elif isinstance(color, int):
        assert 0 <= color <= 255
        return color, color, color
    elif isinstance(color, np.ndarray):
        assert color.ndim == 1 and color.size == 3
        assert np.all((color >= 0) & (color <= 255))
        # Convert to builtin ints so the result matches the documented
        # ``tuple[int]`` instead of leaking np.uint8 scalars to callers.
        return tuple(int(channel) for channel in color.astype(np.uint8))
    elif isinstance(color, Color):
        return color.value
    else:
        raise TypeError(f'Invalid type for color: {type(color)}')
def imshow_bboxes(img,
                  bboxes,
                  colors='green',
                  top_k=-1,
                  thickness=1,
                  show=True,
                  win_name='',
                  wait_time=0,
                  out_file=None):
    """Draw groups of bounding boxes on an image.

    Each entry of ``bboxes`` is one group of boxes, drawn with the color at
    the same position in ``colors``.

    Args:
        img (str or ndarray): The image to be displayed.
        bboxes (list or ndarray): A list of ndarray of shape (k, 4). A bare
            ndarray is treated as a single group.
        colors (list[str or tuple or Color]): A list of colors. A non-list
            value is reused for every group.
        top_k (int): Plot the first k bboxes only if set positive.
        thickness (int): Thickness of lines.
        show (bool): Whether to show the image.
        win_name (str): The window name.
        wait_time (int): Value of waitKey param.
        out_file (str, optional): The filename to write the image.

    Returns:
        ndarray: The image with bboxes drawn on it.
    """
    img = np.ascontiguousarray(imread(img))

    if isinstance(bboxes, np.ndarray):
        bboxes = [bboxes]
    if not isinstance(colors, list):
        colors = [colors] * len(bboxes)
    colors = [color_val(c) for c in colors]
    assert len(bboxes) == len(colors)

    for group, group_color in zip(bboxes, colors):
        group = group.astype(np.int32)
        total = group.shape[0]
        # A non-positive top_k means "draw every box in the group".
        limit = total if top_k <= 0 else min(top_k, total)
        for box in group[:limit]:
            cv2.rectangle(
                img, (box[0], box[1]), (box[2], box[3]),
                group_color,
                thickness=thickness)

    if show:
        imshow(img, win_name, wait_time)
    if out_file is not None:
        imwrite(img, out_file)
    return img
+ """ + assert bboxes.ndim == 2 + assert labels.ndim == 1 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5 + img = imread(img) + img = np.ascontiguousarray(img) + + if score_thr > 0: + assert bboxes.shape[1] == 5 + scores = bboxes[:, -1] + inds = scores > score_thr + bboxes = bboxes[inds, :] + labels = labels[inds] + + bbox_color = color_val(bbox_color) + text_color = color_val(text_color) + + for bbox, label in zip(bboxes, labels): + bbox_int = bbox.astype(np.int32) + left_top = (bbox_int[0], bbox_int[1]) + right_bottom = (bbox_int[2], bbox_int[3]) + cv2.rectangle( + img, left_top, right_bottom, bbox_color, thickness=thickness) + label_text = class_names[ + label] if class_names is not None else f'cls {label}' + if len(bbox) > 4: + label_text += f'|{bbox[-1]:.02f}' + cv2.putText(img, label_text, (bbox_int[0], bbox_int[1] - 2), + cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color) + + if show: + imshow(img, win_name, wait_time) + if out_file is not None: + imwrite(img, out_file) + return img diff --git a/lavis/common/annotator/uniformer/mmcv/visualization/optflow.py b/lavis/common/annotator/uniformer/mmcv/visualization/optflow.py new file mode 100644 index 0000000000000000000000000000000000000000..c3870c700f7c946177ee5d536ce3f6c814a77ce7 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv/visualization/optflow.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import division + +import numpy as np + +from annotator.uniformer.mmcv.image import rgb2bgr +from annotator.uniformer.mmcv.video import flowread +from .image import imshow + + +def flowshow(flow, win_name='', wait_time=0): + """Show optical flow. + + Args: + flow (ndarray or str): The optical flow to be displayed. + win_name (str): The window name. + wait_time (int): Value of waitKey param. 
def flow2rgb(flow, color_wheel=None, unknown_thr=1e6):
    """Convert flow map to RGB image.

    The flow is normalised by its largest magnitude. The direction of each
    vector picks a color by interpolating between adjacent color wheel
    bins, and the magnitude controls how far that color is from white.

    Args:
        flow (ndarray): Array of optical flow with shape (H, W, 2). Integer
            arrays are converted to float32 before processing; the input
            array itself is never modified.
        color_wheel (ndarray or None): Color wheel used to map flow field to
            RGB colorspace. Default color wheel will be used if not specified.
        unknown_thr (float): Values above this threshold will be marked as
            unknown and thus ignored.

    Returns:
        ndarray: RGB image that can be visualized. Pixels whose flow is NaN
        or exceeds ``unknown_thr`` are set to 0.
    """
    assert flow.ndim == 3 and flow.shape[-1] == 2
    if color_wheel is None:
        color_wheel = make_color_wheel()
    assert color_wheel.ndim == 2 and color_wheel.shape[1] == 3
    num_bins = color_wheel.shape[0]

    # The normalisation below divides in place, which numpy refuses to do
    # on integer arrays; promote non-floating input first.
    if not np.issubdtype(flow.dtype, np.floating):
        flow = flow.astype(np.float32)

    dx = flow[:, :, 0].copy()
    dy = flow[:, :, 1].copy()

    ignore_inds = (
        np.isnan(dx) | np.isnan(dy) | (np.abs(dx) > unknown_thr) |
        (np.abs(dy) > unknown_thr))
    dx[ignore_inds] = 0
    dy[ignore_inds] = 0

    rad = np.sqrt(dx**2 + dy**2)
    # Skip normalisation for an all-zero field to avoid dividing by zero.
    if np.any(rad > np.finfo(float).eps):
        max_rad = np.max(rad)
        dx /= max_rad
        dy /= max_rad

    rad = np.sqrt(dx**2 + dy**2)
    angle = np.arctan2(-dy, -dx) / np.pi

    # Map the angle in [-1, 1] to a fractional bin and blend the two
    # neighbouring wheel colors (wrapping around at the last bin).
    bin_real = (angle + 1) / 2 * (num_bins - 1)
    bin_left = np.floor(bin_real).astype(int)
    bin_right = (bin_left + 1) % num_bins
    w = (bin_real - bin_left.astype(np.float32))[..., None]
    flow_img = (1 -
                w) * color_wheel[bin_left, :] + w * color_wheel[bin_right, :]
    small_ind = rad <= 1
    flow_img[small_ind] = 1 - rad[small_ind, None] * (1 - flow_img[small_ind])
    flow_img[np.logical_not(small_ind)] *= 0.75

    flow_img[ignore_inds, :] = 0

    return flow_img
+ + Returns: + ndarray: Color wheel of shape (total_bins, 3). + """ + if bins is None: + bins = [15, 6, 4, 11, 13, 6] + assert len(bins) == 6 + + RY, YG, GC, CB, BM, MR = tuple(bins) + + ry = [1, np.arange(RY) / RY, 0] + yg = [1 - np.arange(YG) / YG, 1, 0] + gc = [0, 1, np.arange(GC) / GC] + cb = [0, 1 - np.arange(CB) / CB, 1] + bm = [np.arange(BM) / BM, 0, 1] + mr = [1, 0, 1 - np.arange(MR) / MR] + + num_bins = RY + YG + GC + CB + BM + MR + + color_wheel = np.zeros((3, num_bins), dtype=np.float32) + + col = 0 + for i, color in enumerate([ry, yg, gc, cb, bm, mr]): + for j in range(3): + color_wheel[j, col:col + bins[i]] = color[j] + col += bins[i] + + return color_wheel.T diff --git a/lavis/common/annotator/uniformer/mmcv_custom/__init__.py b/lavis/common/annotator/uniformer/mmcv_custom/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b958738b9fd93bfcec239c550df1d9a44b8c536 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv_custom/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +from .checkpoint import load_checkpoint + +__all__ = ['load_checkpoint'] \ No newline at end of file diff --git a/lavis/common/annotator/uniformer/mmcv_custom/checkpoint.py b/lavis/common/annotator/uniformer/mmcv_custom/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..19b87fef0a52d31babcdb3edb8f3089b6420173f --- /dev/null +++ b/lavis/common/annotator/uniformer/mmcv_custom/checkpoint.py @@ -0,0 +1,500 @@ +# Copyright (c) Open-MMLab. All rights reserved. 
+import io +import os +import os.path as osp +import pkgutil +import time +import warnings +from collections import OrderedDict +from importlib import import_module +from tempfile import TemporaryDirectory + +import torch +import torchvision +from torch.optim import Optimizer +from torch.utils import model_zoo +from torch.nn import functional as F + +import annotator.uniformer.mmcv as mmcv +from annotator.uniformer.mmcv.fileio import FileClient +from annotator.uniformer.mmcv.fileio import load as load_file +from annotator.uniformer.mmcv.parallel import is_module_wrapper +from annotator.uniformer.mmcv.utils import mkdir_or_exist +from annotator.uniformer.mmcv.runner import get_dist_info + +ENV_MMCV_HOME = 'MMCV_HOME' +ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' +DEFAULT_CACHE_DIR = '~/.cache' + + +def _get_mmcv_home(): + mmcv_home = os.path.expanduser( + os.getenv( + ENV_MMCV_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) + + mkdir_or_exist(mmcv_home) + return mmcv_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + + This method is modified from :meth:`torch.nn.Module.load_state_dict`. + Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. 
def load_url_dist(url, model_dir=None):
    """Load a checkpoint from ``url``, letting local rank 0 go first.

    Local rank 0 calls ``model_zoo.load_url`` immediately. When more than
    one process is running, all processes then meet at a barrier and the
    remaining ranks call ``model_zoo.load_url`` afterwards.

    Args:
        url (str): URL of the checkpoint.
        model_dir (str, optional): Directory passed through to
            ``model_zoo.load_url``.

    Returns:
        The object returned by ``model_zoo.load_url``.
    """
    global_rank, world_size = get_dist_info()
    # Prefer the per-node rank when the launcher exports LOCAL_RANK.
    local_rank = int(os.environ.get('LOCAL_RANK', global_rank))

    def _fetch():
        return model_zoo.load_url(url, model_dir=model_dir)

    if local_rank == 0:
        checkpoint = _fetch()
    if world_size > 1:
        torch.distributed.barrier()
        if local_rank > 0:
            checkpoint = _fetch()
    return checkpoint
load_pavimodel_dist(model_path, map_location=None): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + try: + from pavi import modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load( + downloaded_file, map_location=map_location) + return checkpoint + + +def load_fileclient_dist(filename, backend, map_location): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + allowed_backends = ['ceph'] + if backend not in allowed_backends: + raise ValueError(f'Load from Backend {backend} is not supported.') + if rank == 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') 
+ model_urls.update(_urls) + return model_urls + + +def get_external_models(): + mmcv_home = _get_mmcv_home() + default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmcv_home, 'open_mmlab.json') + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmcv.__path__[0], + 'model_zoo/deprecated.json') + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + state_dict = checkpoint['state_dict'] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith('backbone.'): + new_state_dict[k[9:]] = v + new_checkpoint = dict(state_dict=new_state_dict) + + return new_checkpoint + + +def _load_checkpoint(filename, map_location=None): + """Load checkpoint from somewhere (modelzoo, file, url). + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. Default: None. + + Returns: + dict | OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. 
def load_checkpoint(model,
                    filename,
                    map_location='cpu',
                    strict=False,
                    logger=None):
    """Load checkpoint from a file or URI.

    Besides loading, the state dict is adapted to the model: a leading
    ``module.`` prefix is stripped, the ``encoder.`` branch is selected when
    present, ``absolute_pos_embed`` is reshaped to the model's layout, and
    ``relative_position_bias_table`` entries are bicubically resized when
    their length differs from the model's.

    Args:
        model (Module): Module to load checkpoint.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str): Same as :func:`torch.load`.
        strict (bool): Whether to raise when the keys of the model and the
            checkpoint do not match exactly.
        logger (:mod:`logging.Logger` or None): The logger for error message.
            When None, messages are printed instead.

    Returns:
        dict or OrderedDict: The loaded checkpoint.

    Raises:
        RuntimeError: If the loaded checkpoint is not a dict.
    """

    def _warn(msg):
        # ``logger`` defaults to None; fall back to print like
        # load_state_dict does instead of failing with AttributeError.
        if logger is not None:
            logger.warning(msg)
        else:
            print(msg)

    checkpoint = _load_checkpoint(filename, map_location)
    # OrderedDict is a subclass of dict
    if not isinstance(checkpoint, dict):
        raise RuntimeError(
            f'No state_dict found in checkpoint file {filename}')
    # get state_dict from checkpoint
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint
    # strip prefix of state_dict (guarded so an empty dict does not raise)
    if state_dict and next(iter(state_dict.keys())).startswith('module.'):
        state_dict = {k[7:]: v for k, v in state_dict.items()}

    # for MoBY, load model of online branch
    if state_dict and min(state_dict.keys()).startswith('encoder'):
        state_dict = {
            k.replace('encoder.', ''): v
            for k, v in state_dict.items() if k.startswith('encoder.')
        }

    # reshape absolute position embedding
    if state_dict.get('absolute_pos_embed') is not None:
        absolute_pos_embed = state_dict['absolute_pos_embed']
        N1, L, C1 = absolute_pos_embed.size()
        N2, C2, H, W = model.absolute_pos_embed.size()
        if N1 != N2 or C1 != C2 or L != H * W:
            _warn("Error in loading absolute_pos_embed, pass")
        else:
            state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
                N2, H, W, C2).permute(0, 3, 1, 2)

    # interpolate position bias table if needed
    relative_position_bias_table_keys = [
        k for k in state_dict.keys() if "relative_position_bias_table" in k
    ]
    if relative_position_bias_table_keys:
        # Build the model's state dict once rather than once per table.
        model_state = model.state_dict()
    for table_key in relative_position_bias_table_keys:
        table_pretrained = state_dict[table_key]
        table_current = model_state[table_key]
        L1, nH1 = table_pretrained.size()
        L2, nH2 = table_current.size()
        if nH1 != nH2:
            _warn(f"Error in loading {table_key}, pass")
        else:
            if L1 != L2:
                # Tables are treated as square (S x S) grids per head.
                S1 = int(L1 ** 0.5)
                S2 = int(L2 ** 0.5)
                table_pretrained_resized = F.interpolate(
                    table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
                    size=(S2, S2), mode='bicubic')
                state_dict[table_key] = table_pretrained_resized.view(
                    nH2, L2).permute(1, 0)

    # load state_dict
    load_state_dict(model, state_dict, strict, logger)
    return checkpoint
def save_checkpoint(model, filename, optimizer=None, meta=None):
    """Save checkpoint to file.

    The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
    ``optimizer``. By default ``meta`` will contain version and time info.
    The ``optimizer`` field is only written when ``optimizer`` is an
    :obj:`Optimizer` or a dict of optimizers.

    Args:
        model (Module): Module whose params are to be saved. A wrapped
            module is unwrapped via ``model.module`` first.
        filename (str): Checkpoint filename. A ``pavi://`` prefix uploads
            the checkpoint through ``pavi.modelcloud`` instead of writing
            to the local path.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. May
            also be a dict mapping names to optimizers.
        meta (dict, optional): Metadata to be saved in checkpoint. When a
            dict is passed it is updated in place with the version, time
            and (if available) ``CLASSES`` entries.

    Raises:
        TypeError: If ``meta`` is neither a dict nor None.
        ImportError: If ``filename`` starts with ``pavi://`` and pavi is
            not installed.
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    # Save the weights of the underlying module, not of the wrapper.
    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        # Multiple named optimizers: store one state dict per name.
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()

    if filename.startswith('pavi://'):
        try:
            from pavi import modelcloud
            from pavi.exception import NodeNotFoundError
        except ImportError:
            raise ImportError(
                'Please install pavi to load checkpoint from modelcloud.')
        # Drop the 'pavi://' scheme and split into folder and file name.
        model_path = filename[7:]
        root = modelcloud.Folder()
        model_dir, model_name = osp.split(model_path)
        try:
            model = modelcloud.get(model_dir)
        except NodeNotFoundError:
            # The remote folder does not exist yet; create it.
            model = root.create_training_model(model_dir)
        # Serialize to a temporary local file, then upload that file.
        with TemporaryDirectory() as tmp_dir:
            checkpoint_file = osp.join(tmp_dir, model_name)
            with open(checkpoint_file, 'wb') as f:
                torch.save(checkpoint, f)
                f.flush()
            model.create_file(checkpoint_file, name=model_name)
    else:
        mmcv.mkdir_or_exist(osp.dirname(filename))
        # immediately flush buffer
        with open(filename, 'wb') as f:
            torch.save(checkpoint, f)
            f.flush()
def init_segmentor(config, checkpoint=None, device='cuda:0'):
    """Build a segmentor from a config and put it in eval mode.

    Args:
        config (str or :obj:`mmcv.Config`): Config file path or an already
            loaded config object.
        checkpoint (str, optional): Checkpoint path. When None, no weights
            are loaded and ``CLASSES`` / ``PALETTE`` are not set.
        device (str, optional): Device the model is moved to.
            Default 'cuda:0'. Use 'cpu' to keep the model on CPU.

    Returns:
        nn.Module: The constructed segmentor, with the config attached as
        ``model.cfg``.

    Raises:
        TypeError: If ``config`` is neither a str nor an ``mmcv.Config``.
    """
    if isinstance(config, str):
        cfg = mmcv.Config.fromfile(config)
    elif isinstance(config, mmcv.Config):
        cfg = config
    else:
        raise TypeError('config must be a filename or Config object, '
                        'but got {}'.format(type(config)))

    # Inference only: drop pretrained-weight and training settings.
    cfg.model.pretrained = None
    cfg.model.train_cfg = None
    segmentor = build_segmentor(cfg.model, test_cfg=cfg.get('test_cfg'))

    if checkpoint is not None:
        loaded = load_checkpoint(segmentor, checkpoint, map_location='cpu')
        ckpt_meta = loaded['meta']
        segmentor.CLASSES = ckpt_meta['CLASSES']
        segmentor.PALETTE = ckpt_meta['PALETTE']

    # Keep the config on the model so later calls can read it back.
    segmentor.cfg = cfg
    segmentor.to(device)
    segmentor.eval()
    return segmentor


class LoadImage:
    """Pipeline step that loads ``results['img']`` via ``mmcv.imread``."""

    def __call__(self, results):
        """Load the image and record its filename and shape.

        Args:
            results (dict): Must contain ``'img'``; when that value is a
                str it is also recorded as the filename.

        Returns:
            dict: The same ``results`` dict, updated with ``filename``,
            ``ori_filename``, ``img``, ``img_shape`` and ``ori_shape``.
        """
        source = results['img']
        # Only a string source has a filename worth recording.
        name = source if isinstance(source, str) else None
        results['filename'] = name
        results['ori_filename'] = name

        image = mmcv.imread(source)
        results['img'] = image
        results['img_shape'] = image.shape
        results['ori_shape'] = image.shape
        return results
+ """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # build the data pipeline + test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:] + test_pipeline = Compose(test_pipeline) + # prepare data + data = dict(img=img) + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device])[0] + else: + data['img_metas'] = [i.data[0] for i in data['img_metas']] + + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + return result + + +def show_result_pyplot(model, + img, + result, + palette=None, + fig_size=(15, 10), + opacity=0.5, + title='', + block=True): + """Visualize the segmentation results on the image. + + Args: + model (nn.Module): The loaded segmentor. + img (str or np.ndarray): Image filename or loaded image. + result (list): The segmentation result. + palette (list[list[int]]] | None): The palette of segmentation + map. If None is given, random palette will be generated. + Default: None + fig_size (tuple): Figure size of the pyplot figure. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. + title (str): The title of pyplot figure. + Default is ''. + block (bool): Whether to block the pyplot figure. + Default is True. 
def np2tmp(array, temp_file_name=None):
    """Save ndarray to a local numpy file and return the file's real name.

    Args:
        array (ndarray): Ndarray to save.
        temp_file_name (str): Numpy file name. If 'temp_file_name=None', a
            name is generated with ``tempfile.NamedTemporaryFile``; the
            file is not deleted automatically. Default: None.

    Returns:
        str: The name of the file that was actually written. ``np.save``
        appends ``.npy`` to names lacking that suffix, so for a string
        name without the suffix the returned value includes it.
    """
    if temp_file_name is None:
        # Close the handle right away; only the reserved name is needed and
        # np.save reopens the path itself.
        with tempfile.NamedTemporaryFile(
                suffix='.npy', delete=False) as tmp_file:
            temp_file_name = tmp_file.name
    elif isinstance(temp_file_name, str) and \
            not temp_file_name.endswith('.npy'):
        # Mirror np.save's own renaming so the returned name can be loaded.
        temp_file_name = temp_file_name + '.npy'
    np.save(temp_file_name, array)
    return temp_file_name
def multi_gpu_test(model,
                   data_loader,
                   tmpdir=None,
                   gpu_collect=False,
                   efficient_test=False):
    """Run inference on every rank and gather the predictions.

    Each rank runs the model over its data loader, then the per-rank
    results are merged by ``collect_results_gpu`` when ``gpu_collect`` is
    true and by ``collect_results_cpu`` (using ``tmpdir``) otherwise.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (utils.data.Dataloader): Pytorch data loader.
        tmpdir (str): Directory handed to the CPU collector for the
            temporary per-rank results.
        gpu_collect (bool): Option to use either gpu or cpu to collect
            results.
        efficient_test (bool): Whether save the results as local numpy
            files to save CPU memory during evaluation. Default: False.

    Returns:
        list: The prediction results, as returned by the chosen collector.
    """
    model.eval()
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    is_main = rank == 0
    # Only rank 0 reports progress.
    progress = mmcv.ProgressBar(len(dataset)) if is_main else None

    gathered = []
    for batch in data_loader:
        with torch.no_grad():
            output = model(return_loss=False, rescale=True, **batch)

        is_list = isinstance(output, list)
        if efficient_test:
            # Swap in-memory predictions for temp-file names.
            if is_list:
                output = [np2tmp(item) for item in output]
            else:
                output = np2tmp(output)
        if is_list:
            gathered.extend(output)
        else:
            gathered.append(output)

        if is_main:
            # Advance once per sample across all ranks, using this rank's
            # batch size multiplied by the world size.
            steps = batch['img'][0].size(0) * world_size
            for _ in range(steps):
                progress.update()

    # collect results from all ranks
    if gpu_collect:
        return collect_results_gpu(gathered, len(dataset))
    return collect_results_cpu(gathered, len(dataset), tmpdir)
def collect_results_gpu(result_part, size):
    """Collect results from all ranks through GPU all-gather.

    Each rank pickles its part into a uint8 CUDA tensor. The tensor
    lengths are gathered first so that every rank can pad its payload to
    the longest one before the payloads themselves are gathered.

    Args:
        result_part (list): Results produced on the current rank.
        size (int): Number of results to keep after merging.

    Returns:
        list | None: On rank 0 the merged results, truncated to ``size``;
        None on every other rank.
    """
    rank, world_size = get_dist_info()

    # Pickle this rank's results into a byte tensor on the GPU.
    payload = torch.tensor(
        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')

    # Exchange payload lengths.
    local_shape = torch.tensor(payload.shape, device='cuda')
    all_shapes = [local_shape.clone() for _ in range(world_size)]
    dist.all_gather(all_shapes, local_shape)

    # Pad to the longest payload so all gathered tensors have equal size.
    longest = torch.tensor(all_shapes).max()
    send_buffer = torch.zeros(longest, dtype=torch.uint8, device='cuda')
    send_buffer[:local_shape[0]] = payload
    recv_buffers = [payload.new_zeros(longest) for _ in range(world_size)]
    dist.all_gather(recv_buffers, send_buffer)

    if rank != 0:
        return None

    # NOTE: pickle.loads runs on bytes received from the other ranks.
    parts = [
        pickle.loads(buffer[:shape[0]].cpu().numpy().tobytes())
        for buffer, shape in zip(recv_buffers, all_shapes)
    ]
    # Interleave: one result from each rank in turn (zip stops at the
    # shortest part), then drop the samples the dataloader may have padded.
    interleaved = [item for group in zip(*parts) for item in group]
    return interleaved[:size]
def set_random_seed(seed, deterministic=False):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Default: False.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if not deterministic:
        return
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if cfg.get('runner') is None: + cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + batch_processor=None, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # register hooks + runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) diff --git a/lavis/common/annotator/uniformer/mmseg/core/__init__.py b/lavis/common/annotator/uniformer/mmseg/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..965605587211b7bf0bd6bc3acdbb33dd49cab023 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/core/__init__.py @@ -0,0 +1,3 @@ +from .evaluation import * # noqa: F401, F403 +from .seg import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 diff --git a/lavis/common/annotator/uniformer/mmseg/core/evaluation/__init__.py 
def cityscapes_classes():
    """Cityscapes class names for external use.

    Returns:
        list[str]: A new list of the 19 class names on every call.
    """
    # Comma-separated because two names ('traffic light', 'traffic sign')
    # contain a space.
    names = ('road,sidewalk,building,wall,fence,pole,traffic light,'
             'traffic sign,vegetation,terrain,sky,person,rider,car,truck,'
             'bus,train,motorcycle,bicycle')
    return names.split(',')
def voc_classes():
    """Pascal VOC class names for external use.

    Returns:
        list[str]: A new list of the 21 class names on every call.
    """
    # No name contains whitespace, so a plain split() recovers them.
    names = ('background aeroplane bicycle bird boat bottle bus car cat '
             'chair cow diningtable dog horse motorbike person pottedplant '
             'sheep sofa train tvmonitor')
    return names.split()


def cityscapes_palette():
    """Cityscapes palette for external use.

    Returns:
        list[list[int]]: A new list of 19 three-element colour lists on
        every call.
    """
    colour_rows = (
        (128, 64, 128), (244, 35, 232), (70, 70, 70), (102, 102, 156),
        (190, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0),
        (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60),
        (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100),
        (0, 0, 230), (119, 11, 32),
    )
    # Callers receive mutable lists, exactly as before.
    return [list(colour) for colour in colour_rows]
[255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 
def voc_palette():
    """Pascal VOC palette for external use."""
    return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128],
            [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0],
            [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128],
            [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0],
            [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]


# Canonical dataset name -> every alias accepted by get_classes/get_palette.
dataset_aliases = {
    'cityscapes': ['cityscapes'],
    'ade': ['ade', 'ade20k'],
    'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug']
}


def _call_dataset_table(dataset, suffix):
    """Resolve a dataset alias and call its ``<name><suffix>`` function.

    Args:
        dataset (str): Dataset name or alias listed in ``dataset_aliases``.
        suffix (str): ``'_classes'`` or ``'_palette'``.

    Returns:
        list: Whatever the resolved module-level function returns.

    Raises:
        TypeError: If ``dataset`` is not a str.
        ValueError: If ``dataset`` is not a known alias.
    """
    if not isinstance(dataset, str):
        raise TypeError(f'dataset must be a str, but got {type(dataset)}')
    alias2name = {
        alias: name
        for name, aliases in dataset_aliases.items() for alias in aliases
    }
    if dataset not in alias2name:
        raise ValueError(f'Unrecognized dataset: {dataset}')
    # Look the function up by name instead of eval()-ing a built string.
    return globals()[alias2name[dataset] + suffix]()


def get_classes(dataset):
    """Get class names of a dataset.

    Args:
        dataset (str): Dataset name or alias.

    Returns:
        list[str]: Result of the matching ``<name>_classes()`` function.

    Raises:
        TypeError: If ``dataset`` is not a str.
        ValueError: If ``dataset`` is not a known alias.
    """
    return _call_dataset_table(dataset, '_classes')


def get_palette(dataset):
    """Get class palette (RGB) of a dataset.

    Args:
        dataset (str): Dataset name or alias.

    Returns:
        list[list[int]]: Result of the matching ``<name>_palette()``
        function.

    Raises:
        TypeError: If ``dataset`` is not a str.
        ValueError: If ``dataset`` is not a known alias.
    """
    return _call_dataset_table(dataset, '_palette')
class EvalHook(_EvalHook):
    """Single GPU EvalHook, with efficient test support.

    Args:
        by_epoch (bool): Determine perform evaluation by epoch or by
            iteration. If set to True, it will perform by epoch.
            Otherwise, by iteration. Default: False.
        efficient_test (bool): Whether save the results as local numpy
            files to save CPU memory during evaluation. Default: False.
    """

    greater_keys = ['mIoU', 'mAcc', 'aAcc']

    def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs):
        super().__init__(*args, by_epoch=by_epoch, **kwargs)
        self.efficient_test = efficient_test

    def after_train_iter(self, runner):
        """After train iter hook.

        Override default ``single_gpu_test``. Does nothing in by-epoch
        mode or when the current iteration is not on the interval.
        """
        if self.by_epoch or not self.every_n_iters(runner, self.interval):
            return
        # Imported lazily, as in the original hook.
        from annotator.uniformer.mmseg.apis import single_gpu_test
        runner.log_buffer.clear()
        results = single_gpu_test(
            runner.model,
            self.dataloader,
            show=False,
            efficient_test=self.efficient_test)
        self.evaluate(runner, results)

    def after_train_epoch(self, runner):
        """After train epoch hook.

        Override default ``single_gpu_test``. Does nothing in by-iteration
        mode or when the current epoch is not on the interval.
        """
        if not self.by_epoch or not self.every_n_epochs(runner, self.interval):
            return
        from annotator.uniformer.mmseg.apis import single_gpu_test
        runner.log_buffer.clear()
        # Honour efficient_test here too, matching after_train_iter.
        results = single_gpu_test(
            runner.model,
            self.dataloader,
            show=False,
            efficient_test=self.efficient_test)
        self.evaluate(runner, results)


class DistEvalHook(_DistEvalHook):
    """Distributed EvalHook, with efficient test support.

    Args:
        by_epoch (bool): Determine perform evaluation by epoch or by
            iteration. If set to True, it will perform by epoch.
            Otherwise, by iteration. Default: False.
        efficient_test (bool): Whether save the results as local numpy
            files to save CPU memory during evaluation. Default: False.
    """

    greater_keys = ['mIoU', 'mAcc', 'aAcc']

    def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs):
        super().__init__(*args, by_epoch=by_epoch, **kwargs)
        self.efficient_test = efficient_test

    def after_train_iter(self, runner):
        """After train iter hook.

        Override default ``multi_gpu_test``. Does nothing in by-epoch mode
        or when the current iteration is not on the interval.
        """
        if self.by_epoch or not self.every_n_iters(runner, self.interval):
            return
        from annotator.uniformer.mmseg.apis import multi_gpu_test
        runner.log_buffer.clear()
        results = multi_gpu_test(
            runner.model,
            self.dataloader,
            tmpdir=osp.join(runner.work_dir, '.eval_hook'),
            gpu_collect=self.gpu_collect,
            efficient_test=self.efficient_test)
        # Only rank 0 evaluates the collected results.
        if runner.rank == 0:
            print('\n')
            self.evaluate(runner, results)

    def after_train_epoch(self, runner):
        """After train epoch hook.

        Override default ``multi_gpu_test``. Does nothing in by-iteration
        mode or when the current epoch is not on the interval.
        """
        if not self.by_epoch or not self.every_n_epochs(runner, self.interval):
            return
        from annotator.uniformer.mmseg.apis import multi_gpu_test
        runner.log_buffer.clear()
        # Honour efficient_test here too, matching after_train_iter.
        results = multi_gpu_test(
            runner.model,
            self.dataloader,
            tmpdir=osp.join(runner.work_dir, '.eval_hook'),
            gpu_collect=self.gpu_collect,
            efficient_test=self.efficient_test)
        if runner.rank == 0:
            print('\n')
            self.evaluate(runner, results)
def intersect_and_union(pred_label,
                        label,
                        num_classes,
                        ignore_index,
                        label_map=None,
                        reduce_zero_label=False):
    """Calculate per-class intersection and union histograms.

    The ground-truth array passed in is never modified: remapping and
    zero-label reduction work on a private copy.

    Args:
        pred_label (ndarray | str): Prediction segmentation map
            or predict result filename (loaded with ``np.load``).
        label (ndarray | str): Ground truth segmentation map
            or label filename (loaded with ``mmcv.imread``).
        num_classes (int): Number of categories.
        ignore_index (int): Index that will be ignored in evaluation.
        label_map (dict, optional): Mapping old labels to new labels. Every
            entry is matched against the original label values, so a map
            such as ``{0: 1, 1: 0}`` swaps the two ids. None or an empty
            dict means no remapping. Default: None.
        reduce_zero_label (bool): Whether to treat label 0 as ignored
            (it becomes 255) and shift all other labels down by one.
            Default: False.

    Returns:
        torch.Tensor: The intersection of prediction and ground truth
            histogram on all classes.
        torch.Tensor: The union of prediction and ground truth histogram on
            all classes.
        torch.Tensor: The prediction histogram on all classes.
        torch.Tensor: The ground truth histogram on all classes.
    """
    if isinstance(pred_label, str):
        pred_label = torch.from_numpy(np.load(pred_label))
    else:
        pred_label = torch.from_numpy(pred_label)

    if isinstance(label, str):
        label = torch.from_numpy(
            mmcv.imread(label, flag='unchanged', backend='pillow'))
    else:
        label = torch.from_numpy(label)

    # torch.from_numpy shares memory with the caller's array, so every
    # in-place edit below must happen on a clone.
    if label_map:
        original = label  # only read from, to avoid chained remapping
        label = original.clone()
        for old_id, new_id in label_map.items():
            label[original == old_id] = new_id
    elif reduce_zero_label:
        label = label.clone()

    if reduce_zero_label:
        # 0 -> 255 (ignored); all other ids shift down by one; the value
        # 254 produced by shifting 255 is mapped back to 255.
        label[label == 0] = 255
        label = label - 1
        label[label == 254] = 255

    mask = (label != ignore_index)
    pred_label = pred_label[mask]
    label = label[mask]

    intersect = pred_label[pred_label == label]
    area_intersect = torch.histc(
        intersect.float(), bins=(num_classes), min=0, max=num_classes - 1)
    area_pred_label = torch.histc(
        pred_label.float(), bins=(num_classes), min=0, max=num_classes - 1)
    area_label = torch.histc(
        label.float(), bins=(num_classes), min=0, max=num_classes - 1)
    area_union = area_pred_label + area_label - area_intersect
    return area_intersect, area_union, area_pred_label, area_label
+ ndarray: The ground truth histogram on all classes. + """ + num_imgs = len(results) + assert len(gt_seg_maps) == num_imgs + total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) + for i in range(num_imgs): + area_intersect, area_union, area_pred_label, area_label = \ + intersect_and_union( + results[i], gt_seg_maps[i], num_classes, ignore_index, + label_map, reduce_zero_label) + total_area_intersect += area_intersect + total_area_union += area_union + total_area_pred_label += area_pred_label + total_area_label += area_label + return total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label + + +def mean_iou(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False): + """Calculate Mean Intersection and Union (mIoU) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. Default: False. + + Returns: + dict[str, float | ndarray]: + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category IoU, shape (num_classes, ). 
def mean_dice(results,
              gt_seg_maps,
              num_classes,
              ignore_index,
              nan_to_num=None,
              label_map=None,
              reduce_zero_label=False):
    """Calculate Mean Dice (mDice).

    Thin wrapper around ``eval_metrics`` with ``metrics=['mDice']``.

    Args:
        results (list[ndarray] | list[str]): List of prediction segmentation
            maps or list of prediction result filenames.
        gt_seg_maps (list[ndarray] | list[str]): List of ground truth
            segmentation maps or list of label filenames.
        num_classes (int): Number of categories.
        ignore_index (int): Index that will be ignored in evaluation.
        nan_to_num (int, optional): If specified, NaN values will be replaced
            by the numbers defined by the user. Default: None.
        label_map (dict | None): Mapping old labels to new labels. ``None``
            (the default) means no remapping.
        reduce_zero_label (bool): Whether to ignore the zero label.
            Default: False.

    Returns:
        dict[str, ndarray]: The mapping built by ``eval_metrics``:
            'aAcc' (overall accuracy on all images), 'Dice' (per category
            dice, shape (num_classes, )) and 'Acc' (per category accuracy,
            shape (num_classes, )).
    """
    return eval_metrics(
        results=results,
        gt_seg_maps=gt_seg_maps,
        num_classes=num_classes,
        ignore_index=ignore_index,
        metrics=['mDice'],
        nan_to_num=nan_to_num,
        label_map=label_map,
        reduce_zero_label=reduce_zero_label)
def eval_metrics(results,
                 gt_seg_maps,
                 num_classes,
                 ignore_index,
                 metrics=None,
                 nan_to_num=None,
                 label_map=None,
                 reduce_zero_label=False,
                 beta=1):
    """Calculate evaluation metrics.

    Args:
        results (list[ndarray] | list[str]): List of prediction segmentation
            maps or list of prediction result filenames.
        gt_seg_maps (list[ndarray] | list[str]): List of ground truth
            segmentation maps or list of label filenames.
        num_classes (int): Number of categories.
        ignore_index (int): Index that will be ignored in evaluation.
        metrics (list[str] | str | None): Metrics to be evaluated; any of
            'mIoU', 'mDice' and 'mFscore'. ``None`` (the default) is
            treated as ``['mIoU']``.
        nan_to_num (int, optional): If specified, NaN values will be replaced
            by the numbers defined by the user. Default: None.
        label_map (dict | None): Mapping old labels to new labels. ``None``
            (the default) means no remapping.
        reduce_zero_label (bool): Whether to ignore the zero label.
            Default: False.
        beta (int): Forwarded to ``f_score`` when 'mFscore' is requested.
            Default: 1.

    Returns:
        OrderedDict[str, ndarray]: Always contains 'aAcc' (overall accuracy
            on all images). Depending on ``metrics`` it also contains
            'IoU' and 'Acc' (mIoU), 'Dice' and 'Acc' (mDice), or 'Fscore',
            'Precision' and 'Recall' (mFscore); these per-category arrays
            have shape (num_classes, ).

    Raises:
        KeyError: If ``metrics`` contains an unsupported metric name.
    """
    if metrics is None:
        metrics = ['mIoU']
    elif isinstance(metrics, str):
        metrics = [metrics]
    allowed_metrics = ['mIoU', 'mDice', 'mFscore']
    if not set(metrics).issubset(set(allowed_metrics)):
        raise KeyError('metrics {} is not supported'.format(metrics))

    total_area_intersect, total_area_union, total_area_pred_label, \
        total_area_label = total_intersect_and_union(
            results, gt_seg_maps, num_classes, ignore_index, label_map,
            reduce_zero_label)
    all_acc = total_area_intersect.sum() / total_area_label.sum()
    ret_metrics = OrderedDict({'aAcc': all_acc})
    for metric in metrics:
        if metric == 'mIoU':
            iou = total_area_intersect / total_area_union
            acc = total_area_intersect / total_area_label
            ret_metrics['IoU'] = iou
            ret_metrics['Acc'] = acc
        elif metric == 'mDice':
            dice = 2 * total_area_intersect / (
                total_area_pred_label + total_area_label)
            acc = total_area_intersect / total_area_label
            ret_metrics['Dice'] = dice
            ret_metrics['Acc'] = acc
        elif metric == 'mFscore':
            precision = total_area_intersect / total_area_pred_label
            recall = total_area_intersect / total_area_label
            f_value = torch.tensor(
                [f_score(p, r, beta) for p, r in zip(precision, recall)])
            ret_metrics['Fscore'] = f_value
            ret_metrics['Precision'] = precision
            ret_metrics['Recall'] = recall

    # Convert tensors to numpy. The result stays an OrderedDict on every
    # path, so the return type no longer depends on ``nan_to_num``.
    ret_metrics = OrderedDict(
        (metric, value.numpy()) for metric, value in ret_metrics.items())
    if nan_to_num is not None:
        ret_metrics = OrderedDict(
            (metric, np.nan_to_num(metric_value, nan=nan_to_num))
            for metric, metric_value in ret_metrics.items())
    return ret_metrics
class BasePixelSampler(metaclass=ABCMeta):
    """Abstract interface for pixel samplers.

    Concrete samplers must override :meth:`sample`; the class cannot be
    instantiated directly because that method is abstract.
    """

    def __init__(self, **kwargs):
        """Accept arbitrary keyword arguments and ignore them."""

    @abstractmethod
    def sample(self, seg_logit, seg_label):
        """Placeholder for sample function."""
+ + Args: + seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W) + seg_label (torch.Tensor): segmentation label, shape (N, 1, H, W) + + Returns: + torch.Tensor: segmentation weight, shape (N, H, W) + """ + with torch.no_grad(): + assert seg_logit.shape[2:] == seg_label.shape[2:] + assert seg_label.shape[1] == 1 + seg_label = seg_label.squeeze(1).long() + batch_kept = self.min_kept * seg_label.size(0) + valid_mask = seg_label != self.context.ignore_index + seg_weight = seg_logit.new_zeros(size=seg_label.size()) + valid_seg_weight = seg_weight[valid_mask] + if self.thresh is not None: + seg_prob = F.softmax(seg_logit, dim=1) + + tmp_seg_label = seg_label.clone().unsqueeze(1) + tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0 + seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1) + sort_prob, sort_indices = seg_prob[valid_mask].sort() + + if sort_prob.numel() > 0: + min_threshold = sort_prob[min(batch_kept, + sort_prob.numel() - 1)] + else: + min_threshold = 0.0 + threshold = max(min_threshold, self.thresh) + valid_seg_weight[seg_prob[valid_mask] < threshold] = 1. + else: + losses = self.context.loss_decode( + seg_logit, + seg_label, + weight=None, + ignore_index=self.context.ignore_index, + reduction_override='none') + # faster than topk according to https://github.com/pytorch/pytorch/issues/22812 # noqa + _, sort_indices = losses[valid_mask].sort(descending=True) + valid_seg_weight[sort_indices[:batch_kept]] = 1. 
def add_prefix(inputs, prefix):
    """Return a new dict whose keys are ``inputs``' keys behind ``prefix``.

    Args:
        inputs (dict): The input dict with str keys.
        prefix (str): The prefix to add.

    Returns:
        dict: A new dict mapping ``'<prefix>.<name>'`` to the original value
            for every ``name`` in ``inputs``; ``inputs`` is not modified.
    """
    return {f'{prefix}.{name}': value for name, value in inputs.items()}
+ """ + CLASSES = ( + 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', + 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', + 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', + 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', + 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', + 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', + 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', + 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', + 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', + 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', + 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', + 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', + 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', + 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', + 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', + 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', + 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', + 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', + 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag') + + PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], 
[255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + + def __init__(self, **kwargs): + 
def _concat_dataset(cfg, default_args=None):
    """Build a :obj:`ConcatDataset` from a config with list-valued fields.

    ``img_dir``, ``ann_dir`` and ``split`` may each be a single value or a
    list/tuple. One sub-dataset is built per list position through
    ``build_dataset``: list-valued fields are indexed by that position,
    scalar fields are shared by every sub-dataset.

    Args:
        cfg (dict): Dataset config; must contain ``img_dir``.
        default_args (dict | None): Forwarded to ``build_dataset``.

    Returns:
        ConcatDataset: Concatenation of the built sub-datasets.
    """
    from .dataset_wrappers import ConcatDataset

    seq_types = (list, tuple)

    def _count(entry):
        # A list/tuple counts as its length, any other value as one entry.
        return len(entry) if isinstance(entry, seq_types) else 1

    img_dir = cfg['img_dir']
    ann_dir = cfg.get('ann_dir', None)
    split = cfg.get('split', None)

    num_img_dir = _count(img_dir)
    num_ann_dir = 0 if ann_dir is None else _count(ann_dir)
    num_split = 0 if split is None else _count(split)

    # List-valued fields have to agree in length with each other.
    if num_img_dir > 1:
        assert num_img_dir == num_ann_dir or num_ann_dir == 0
        assert num_img_dir == num_split or num_split == 0
    else:
        assert num_split == num_ann_dir or num_ann_dir <= 1

    per_dataset_fields = (
        ('img_dir', img_dir),
        ('ann_dir', ann_dir),
        ('split', split),
    )
    datasets = []
    for idx in range(max(num_split, num_img_dir)):
        data_cfg = copy.deepcopy(cfg)
        for key, values in per_dataset_fields:
            if isinstance(values, seq_types):
                data_cfg[key] = values[idx]
        datasets.append(build_dataset(data_cfg, default_args))

    return ConcatDataset(datasets)
def worker_init_fn(worker_id, num_workers, rank, seed):
    """Seed the NumPy and ``random`` generators of one dataloader worker.

    The seed applied is ``num_workers * rank + worker_id + seed``.

    Args:
        worker_id (int): Worker id.
        num_workers (int): Number of workers.
        rank (int): The rank of current process.
        seed (int): The random seed to use.
    """
    worker_seed = num_workers * rank + worker_id + seed
    # NumPy first, then the stdlib generator.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(worker_seed)
+ + The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is + fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset. + """ + + CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle') + + PALETTE = [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], + [0, 80, 100], [0, 0, 230], [119, 11, 32]] + + def __init__(self, **kwargs): + super(CityscapesDataset, self).__init__( + img_suffix='_leftImg8bit.png', + seg_map_suffix='_gtFine_labelTrainIds.png', + **kwargs) + + @staticmethod + def _convert_to_label_id(result): + """Convert trainId to id for cityscapes.""" + if isinstance(result, str): + result = np.load(result) + import cityscapesscripts.helpers.labels as CSLabels + result_copy = result.copy() + for trainId, label in CSLabels.trainId2label.items(): + result_copy[result == trainId] = label.id + + return result_copy + + def results2img(self, results, imgfile_prefix, to_label_id): + """Write the segmentation results to images. + + Args: + results (list[list | tuple | ndarray]): Testing results of the + dataset. + imgfile_prefix (str): The filename prefix of the png files. + If the prefix is "somepath/xxx", + the png files will be named "somepath/xxx.png". + to_label_id (bool): whether convert output to label_id for + submission + + Returns: + list[str: str]: result txt files which contains corresponding + semantic segmentation images. 
+ """ + mmcv.mkdir_or_exist(imgfile_prefix) + result_files = [] + prog_bar = mmcv.ProgressBar(len(self)) + for idx in range(len(self)): + result = results[idx] + if to_label_id: + result = self._convert_to_label_id(result) + filename = self.img_infos[idx]['filename'] + basename = osp.splitext(osp.basename(filename))[0] + + png_filename = osp.join(imgfile_prefix, f'{basename}.png') + + output = Image.fromarray(result.astype(np.uint8)).convert('P') + import cityscapesscripts.helpers.labels as CSLabels + palette = np.zeros((len(CSLabels.id2label), 3), dtype=np.uint8) + for label_id, label in CSLabels.id2label.items(): + palette[label_id] = label.color + + output.putpalette(palette) + output.save(png_filename) + result_files.append(png_filename) + prog_bar.update() + + return result_files + + def format_results(self, results, imgfile_prefix=None, to_label_id=True): + """Format the results into dir (standard format for Cityscapes + evaluation). + + Args: + results (list): Testing results of the dataset. + imgfile_prefix (str | None): The prefix of images files. It + includes the file path and the prefix of filename, e.g., + "a/b/prefix". If not specified, a temp file will be created. + Default: None. + to_label_id (bool): whether convert output to label_id for + submission. Default: False + + Returns: + tuple: (result_files, tmp_dir), result_files is a list containing + the image paths, tmp_dir is the temporal directory created + for saving json/png files when img_prefix is not specified. 
+ """ + + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: ' + f'{len(results)} != {len(self)}') + + if imgfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + imgfile_prefix = tmp_dir.name + else: + tmp_dir = None + result_files = self.results2img(results, imgfile_prefix, to_label_id) + + return result_files, tmp_dir + + def evaluate(self, + results, + metric='mIoU', + logger=None, + imgfile_prefix=None, + efficient_test=False): + """Evaluation in Cityscapes/default protocol. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + imgfile_prefix (str | None): The prefix of output image file, + for cityscapes evaluation only. It includes the file path and + the prefix of filename, e.g., "a/b/prefix". + If results are evaluated with cityscapes protocol, it would be + the prefix of output png files. The output files would be + png images under folder "a/b/prefix/xxx.png", where "xxx" is + the image name of cityscapes. If not specified, a temp file + will be created for evaluation. + Default: None. + + Returns: + dict[str, float]: Cityscapes/default metrics. + """ + + eval_results = dict() + metrics = metric.copy() if isinstance(metric, list) else [metric] + if 'cityscapes' in metrics: + eval_results.update( + self._evaluate_cityscapes(results, logger, imgfile_prefix)) + metrics.remove('cityscapes') + if len(metrics) > 0: + eval_results.update( + super(CityscapesDataset, + self).evaluate(results, metrics, logger, efficient_test)) + + return eval_results + + def _evaluate_cityscapes(self, results, logger, imgfile_prefix): + """Evaluation in Cityscapes protocol. + + Args: + results (list): Testing results of the dataset. 
+ logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + imgfile_prefix (str | None): The prefix of output image file + + Returns: + dict[str: float]: Cityscapes evaluation results. + """ + try: + import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as CSEval # noqa + except ImportError: + raise ImportError('Please run "pip install cityscapesscripts" to ' + 'install cityscapesscripts first.') + msg = 'Evaluating in Cityscapes style' + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + result_files, tmp_dir = self.format_results(results, imgfile_prefix) + + if tmp_dir is None: + result_dir = imgfile_prefix + else: + result_dir = tmp_dir.name + + eval_results = dict() + print_log(f'Evaluating results under {result_dir} ...', logger=logger) + + CSEval.args.evalInstLevelScore = True + CSEval.args.predictionPath = osp.abspath(result_dir) + CSEval.args.evalPixelAccuracy = True + CSEval.args.JSONOutput = False + + seg_map_list = [] + pred_list = [] + + # when evaluating with official cityscapesscripts, + # **_gtFine_labelIds.png is used + for seg_map in mmcv.scandir( + self.ann_dir, 'gtFine_labelIds.png', recursive=True): + seg_map_list.append(osp.join(self.ann_dir, seg_map)) + pred_list.append(CSEval.getPrediction(CSEval.args, seg_map)) + + eval_results.update( + CSEval.evaluateImgLists(pred_list, seg_map_list, CSEval.args)) + + if tmp_dir is not None: + tmp_dir.cleanup() + + return eval_results diff --git a/lavis/common/annotator/uniformer/mmseg/datasets/custom.py b/lavis/common/annotator/uniformer/mmseg/datasets/custom.py new file mode 100644 index 0000000000000000000000000000000000000000..d8eb2a709cc7a3a68fc6a1e3a1ad98faef4c5b7b --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/datasets/custom.py @@ -0,0 +1,400 @@ +import os +import os.path as osp +from collections import OrderedDict +from functools import reduce + +import annotator.uniformer.mmcv as mmcv 
@DATASETS.register_module()
class CustomDataset(Dataset):
    """Custom dataset for semantic segmentation. An example of file structure
    is as followed.

    .. code-block:: none

        ├── data
        │   ├── my_dataset
        │   │   ├── img_dir
        │   │   │   ├── train
        │   │   │   │   ├── xxx{img_suffix}
        │   │   │   │   ├── yyy{img_suffix}
        │   │   │   │   ├── zzz{img_suffix}
        │   │   │   ├── val
        │   │   ├── ann_dir
        │   │   │   ├── train
        │   │   │   │   ├── xxx{seg_map_suffix}
        │   │   │   │   ├── yyy{seg_map_suffix}
        │   │   │   │   ├── zzz{seg_map_suffix}
        │   │   │   ├── val

    The img/gt_semantic_seg pair of CustomDataset should be of the same
    except suffix. A valid img/gt_semantic_seg filename pair should be like
    ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included
    in the suffix). If split is given, then ``xxx`` is specified in txt file.
    Otherwise, all files in ``img_dir/`` and ``ann_dir`` will be loaded.
    Please refer to ``docs/tutorials/new_dataset.md`` for more details.


    Args:
        pipeline (list[dict]): Processing pipeline
        img_dir (str): Path to image directory
        img_suffix (str): Suffix of images. Default: '.jpg'
        ann_dir (str, optional): Path to annotation directory. Default: None
        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
        split (str, optional): Split txt file. If split is specified, only
            file with suffix in the splits will be loaded. Otherwise, all
            images in img_dir/ann_dir will be loaded. Default: None
        data_root (str, optional): Data root for img_dir/ann_dir. Default:
            None.
        test_mode (bool): If test_mode=True, gt wouldn't be loaded.
        ignore_index (int): The label index to be ignored. Default: 255
        reduce_zero_label (bool): Whether to mark label zero as ignored.
            Default: False
        classes (str | Sequence[str], optional): Specify classes to load.
            If is None, ``cls.CLASSES`` will be used. Default: None.
        palette (Sequence[Sequence[int]]] | np.ndarray | None):
            The palette of segmentation map. If None is given, and
            self.PALETTE is None, random palette will be generated.
            Default: None
    """

    CLASSES = None

    PALETTE = None

    def __init__(self,
                 pipeline,
                 img_dir,
                 img_suffix='.jpg',
                 ann_dir=None,
                 seg_map_suffix='.png',
                 split=None,
                 data_root=None,
                 test_mode=False,
                 ignore_index=255,
                 reduce_zero_label=False,
                 classes=None,
                 palette=None):
        self.pipeline = Compose(pipeline)
        self.img_dir = img_dir
        self.img_suffix = img_suffix
        self.ann_dir = ann_dir
        self.seg_map_suffix = seg_map_suffix
        self.split = split
        self.data_root = data_root
        self.test_mode = test_mode
        self.ignore_index = ignore_index
        self.reduce_zero_label = reduce_zero_label
        # Must exist before get_classes_and_palette(), which may fill it in.
        self.label_map = None
        self.CLASSES, self.PALETTE = self.get_classes_and_palette(
            classes, palette)

        # join paths if data_root is specified
        if self.data_root is not None:
            if not osp.isabs(self.img_dir):
                self.img_dir = osp.join(self.data_root, self.img_dir)
            if not (self.ann_dir is None or osp.isabs(self.ann_dir)):
                self.ann_dir = osp.join(self.data_root, self.ann_dir)
            if not (self.split is None or osp.isabs(self.split)):
                self.split = osp.join(self.data_root, self.split)

        # load annotations
        self.img_infos = self.load_annotations(self.img_dir, self.img_suffix,
                                               self.ann_dir,
                                               self.seg_map_suffix, self.split)

    def __len__(self):
        """Total number of samples of data."""
        return len(self.img_infos)

    def load_annotations(self, img_dir, img_suffix, ann_dir, seg_map_suffix,
                         split):
        """Load annotation from directory.

        Args:
            img_dir (str): Path to image directory
            img_suffix (str): Suffix of images.
            ann_dir (str|None): Path to annotation directory.
            seg_map_suffix (str|None): Suffix of segmentation maps.
            split (str|None): Split txt file. If split is specified, only file
                with suffix in the splits will be loaded. Otherwise, all images
                in img_dir/ann_dir will be loaded. Default: None

        Returns:
            list[dict]: All image info of dataset. Each entry has a
                ``filename`` key and, when ``ann_dir`` is given, an ``ann``
                dict holding the ``seg_map`` file name.
        """

        img_infos = []
        if split is not None:
            with open(split) as f:
                for line in f:
                    img_name = line.strip()
                    if not img_name:
                        # A blank (e.g. trailing) line would otherwise
                        # become an entry whose filename is only the suffix.
                        continue
                    img_info = dict(filename=img_name + img_suffix)
                    if ann_dir is not None:
                        seg_map = img_name + seg_map_suffix
                        img_info['ann'] = dict(seg_map=seg_map)
                    img_infos.append(img_info)
        else:
            for img in mmcv.scandir(img_dir, img_suffix, recursive=True):
                img_info = dict(filename=img)
                if ann_dir is not None:
                    # Swap only the trailing suffix; str.replace would also
                    # rewrite the same text inside a directory or stem.
                    stem = img[:len(img) - len(img_suffix)]
                    seg_map = stem + seg_map_suffix
                    img_info['ann'] = dict(seg_map=seg_map)
                img_infos.append(img_info)

        print_log(f'Loaded {len(img_infos)} images', logger=get_root_logger())
        return img_infos

    def get_ann_info(self, idx):
        """Get annotation by index.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Annotation info of specified index.
        """

        return self.img_infos[idx]['ann']

    def pre_pipeline(self, results):
        """Prepare results dict for pipeline.

        Adds ``seg_fields``, ``img_prefix`` and ``seg_prefix`` in place, plus
        ``label_map`` when custom classes were requested.
        """
        results['seg_fields'] = []
        results['img_prefix'] = self.img_dir
        results['seg_prefix'] = self.ann_dir
        if self.custom_classes:
            results['label_map'] = self.label_map

    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training/test data (with annotation if `test_mode` is set
                False).
        """

        if self.test_mode:
            return self.prepare_test_img(idx)
        else:
            return self.prepare_train_img(idx)

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotation after pipeline with new keys
                introduced by pipeline.
        """

        img_info = self.img_infos[idx]
        ann_info = self.get_ann_info(idx)
        results = dict(img_info=img_info, ann_info=ann_info)
        self.pre_pipeline(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get testing data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Testing data after pipeline with new keys introduced by
                pipeline.
        """

        img_info = self.img_infos[idx]
        results = dict(img_info=img_info)
        self.pre_pipeline(results)
        return self.pipeline(results)

    def format_results(self, results, **kwargs):
        """Place holder to format result to dataset specific output."""

    def get_gt_seg_maps(self, efficient_test=False):
        """Get ground truth segmentation maps for evaluation.

        Args:
            efficient_test (bool): If True, return the annotation file paths
                instead of reading the images. Default: False.

        Returns:
            list: One entry per sample, either the path (``efficient_test``)
                or the map read with ``mmcv.imread``.
        """
        gt_seg_maps = []
        for img_info in self.img_infos:
            seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map'])
            if efficient_test:
                gt_seg_map = seg_map
            else:
                gt_seg_map = mmcv.imread(
                    seg_map, flag='unchanged', backend='pillow')
            gt_seg_maps.append(gt_seg_map)
        return gt_seg_maps

    def get_classes_and_palette(self, classes=None, palette=None):
        """Get class names of current dataset.

        Also sets ``self.custom_classes`` and, when the dataset defines
        ``CLASSES``, fills ``self.label_map`` (old label id -> new label id,
        ``-1`` for classes that were not requested).

        Args:
            classes (Sequence[str] | str | None): If classes is None, use
                default CLASSES defined by builtin dataset. If classes is a
                string, take it as a file name. The file contains the name of
                classes where each line contains one class name. If classes is
                a tuple or list, override the CLASSES defined by the dataset.
            palette (Sequence[Sequence[int]]] | np.ndarray | None):
                The palette of segmentation map. If None is given, random
                palette will be generated. Default: None

        Returns:
            tuple: ``(class_names, palette)``.

        Raises:
            ValueError: If ``classes`` has an unsupported type or is not a
                subset of the dataset's ``CLASSES``.
        """
        if classes is None:
            self.custom_classes = False
            return self.CLASSES, self.PALETTE

        self.custom_classes = True
        if isinstance(classes, str):
            # take it as a file path
            class_names = mmcv.list_from_file(classes)
        elif isinstance(classes, (tuple, list)):
            class_names = classes
        else:
            raise ValueError(f'Unsupported type {type(classes)} of classes.')

        if self.CLASSES:
            # Work on the resolved names: ``classes`` may still be the path
            # of the class-list file, whose characters are not class names.
            if not set(class_names).issubset(self.CLASSES):
                raise ValueError('classes is not a subset of CLASSES.')

            # dictionary, its keys are the old label ids and its values
            # are the new label ids.
            # used for changing pixel labels in load_annotations.
            self.label_map = {}
            for i, c in enumerate(self.CLASSES):
                if c not in class_names:
                    self.label_map[i] = -1
                else:
                    self.label_map[i] = class_names.index(c)

        palette = self.get_palette_for_custom_classes(class_names, palette)

        return class_names, palette

    def get_palette_for_custom_classes(self, class_names, palette=None):
        """Pick the palette that goes with a custom class selection.

        When ``self.label_map`` is set, the result is the subset of
        ``self.PALETTE`` for the kept classes, ordered by their new label id
        (the ``palette`` argument is not consulted in that case). Otherwise
        an explicit ``palette`` is returned unchanged, falling back to
        ``self.PALETTE`` or, if that is None, to random colours.

        Args:
            class_names (Sequence[str]): Names of the selected classes.
            palette (Sequence[Sequence[int]]] | np.ndarray | None): Palette
                requested by the caller. Default: None

        Returns:
            The palette to use for ``class_names``.
        """

        if self.label_map is not None:
            # return subset of palette
            palette = []
            for old_id, new_id in sorted(
                    self.label_map.items(), key=lambda x: x[1]):
                if new_id != -1:
                    palette.append(self.PALETTE[old_id])
            palette = type(self.PALETTE)(palette)

        elif palette is None:
            if self.PALETTE is None:
                palette = np.random.randint(0, 255, size=(len(class_names), 3))
            else:
                palette = self.PALETTE

        return palette

    def evaluate(self,
                 results,
                 metric='mIoU',
                 logger=None,
                 efficient_test=False,
                 **kwargs):
        """Evaluate the dataset.

        Args:
            results (list): Testing results of the dataset. If every entry
                is a ``str`` the entries are treated as file names and the
                files are deleted once evaluation is done.
            metric (str | list[str]): Metrics to be evaluated. 'mIoU',
                'mDice' and 'mFscore' are supported.
            logger (logging.Logger | None | str): Logger used for printing
                related information during evaluation. Default: None.
            efficient_test (bool): Forwarded to ``get_gt_seg_maps``.
                Default: False.

        Returns:
            dict[str, float]: Default metrics. Holds the summary values
                (``aAcc`` and ``m<metric>``) and one ``<metric>.<class>``
                entry per class, all divided by 100 after rounding.

        Raises:
            KeyError: If ``metric`` contains an unsupported name.
        """

        if isinstance(metric, str):
            metric = [metric]
        allowed_metrics = ['mIoU', 'mDice', 'mFscore']
        if not set(metric).issubset(set(allowed_metrics)):
            raise KeyError('metric {} is not supported'.format(metric))
        eval_results = {}
        gt_seg_maps = self.get_gt_seg_maps(efficient_test)
        if self.CLASSES is None:
            # No class list: count the distinct labels in the ground truth.
            num_classes = len(
                reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps]))
        else:
            num_classes = len(self.CLASSES)
        ret_metrics = eval_metrics(
            results,
            gt_seg_maps,
            num_classes,
            self.ignore_index,
            metric,
            label_map=self.label_map,
            reduce_zero_label=self.reduce_zero_label)

        if self.CLASSES is None:
            class_names = tuple(range(num_classes))
        else:
            class_names = self.CLASSES

        # summary table
        ret_metrics_summary = OrderedDict({
            ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2)
            for ret_metric, ret_metric_value in ret_metrics.items()
        })

        # each class table
        ret_metrics.pop('aAcc', None)
        ret_metrics_class = OrderedDict({
            ret_metric: np.round(ret_metric_value * 100, 2)
            for ret_metric, ret_metric_value in ret_metrics.items()
        })
        ret_metrics_class.update({'Class': class_names})
        ret_metrics_class.move_to_end('Class', last=False)

        # for logger
        class_table_data = PrettyTable()
        for key, val in ret_metrics_class.items():
            class_table_data.add_column(key, val)

        summary_table_data = PrettyTable()
        for key, val in ret_metrics_summary.items():
            if key == 'aAcc':
                summary_table_data.add_column(key, [val])
            else:
                summary_table_data.add_column('m' + key, [val])

        print_log('per class results:', logger)
        print_log('\n' + class_table_data.get_string(), logger=logger)
        print_log('Summary:', logger)
        print_log('\n' + summary_table_data.get_string(), logger=logger)

        # each metric dict
        for key, value in ret_metrics_summary.items():
            if key == 'aAcc':
                eval_results[key] = value / 100.0
            else:
                eval_results['m' + key] = value / 100.0

        ret_metrics_class.pop('Class', None)
        for key, value in ret_metrics_class.items():
            eval_results.update({
                key + '.' + str(name): value[idx] / 100.0
                for idx, name in enumerate(class_names)
            })

        # Results given as file names are intermediate files: remove them.
        if mmcv.is_list_of(results, str):
            for file_name in results:
                os.remove(file_name)
        return eval_results
@DATASETS.register_module()
class HRFDataset(CustomDataset):
    """HRF dataset.

    The HRF segmentation maps use label 0 for background, and background
    counts as one of the 2 categories. The constructor therefore pins
    ``reduce_zero_label`` to False and fixes both ``img_suffix`` and
    ``seg_map_suffix`` to '.png'; every other option is forwarded to
    :class:`CustomDataset` unchanged.
    """

    CLASSES = ('background', 'vessel')

    PALETTE = [[120, 120, 120], [6, 230, 230]]

    def __init__(self, **kwargs):
        # Options that are not configurable for this dataset. Passing any
        # of them again through ``kwargs`` is rejected as a duplicate
        # keyword argument.
        fixed_options = dict(
            img_suffix='.png',
            seg_map_suffix='.png',
            reduce_zero_label=False,
        )
        super().__init__(**fixed_options, **kwargs)
        assert osp.exists(self.img_dir)
@DATASETS.register_module()
class PascalContextDataset59(CustomDataset):
    """PascalContext dataset, 59-class variant.

    ``CLASSES`` lists the 59 object categories and has no 'background'
    entry. ``reduce_zero_label`` is fixed to True, ``img_suffix`` is fixed
    to '.jpg' and ``seg_map_suffix`` is fixed to '.png'.

    Args:
        split (str): Split txt file for PascalContext.
    """

    CLASSES = ('aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle',
               'bird', 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet',
               'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow',
               'cup', 'curtain', 'dog', 'door', 'fence', 'floor', 'flower',
               'food', 'grass', 'ground', 'horse', 'keyboard', 'light',
               'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform',
               'pottedplant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk',
               'sign', 'sky', 'snow', 'sofa', 'table', 'track', 'train',
               'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', 'wood')

    PALETTE = [[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3],
               [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230],
               [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61],
               [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140],
               [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200],
               [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71],
               [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92],
               [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6],
               [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8],
               [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8],
               [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255],
               [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140],
               [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0],
               [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0],
               [0, 235, 255], [0, 173, 255], [31, 0, 255]]

    def __init__(self, split, **kwargs):
        # Options that are not configurable for this dataset. Passing any
        # of them again through ``kwargs`` is rejected as a duplicate
        # keyword argument.
        fixed_options = dict(
            img_suffix='.jpg',
            seg_map_suffix='.png',
            split=split,
            reduce_zero_label=True,
        )
        super().__init__(**fixed_options, **kwargs)
        # The image directory must exist and a split file is mandatory.
        assert osp.exists(self.img_dir)
        assert self.split is not None
def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Conversion rules, checked in this order:

    - :class:`torch.Tensor`: returned as is (same object).
    - :class:`numpy.ndarray`: wrapped with :func:`torch.from_numpy`.
    - non-string :class:`Sequence`: passed to :func:`torch.tensor`.
    - :class:`int`: one-element ``torch.LongTensor``.
    - :class:`float`: one-element ``torch.FloatTensor``.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Returns:
        torch.Tensor: The converted data.

    Raises:
        TypeError: If ``data`` is of none of the supported types.
    """

    if isinstance(data, torch.Tensor):
        return data
    if isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    # Strings are sequences too, but they are not convertible.
    if isinstance(data, Sequence) and not mmcv.is_str(data):
        return torch.tensor(data)
    if isinstance(data, int):
        return torch.LongTensor([data])
    if isinstance(data, float):
        return torch.FloatTensor([data])
    raise TypeError(f'type {type(data)} cannot be converted to tensor.')
@PIPELINES.register_module()
class Transpose:
    """Transpose some results by given keys.

    Args:
        keys (Sequence[str]): Keys of results to be transposed.
        order (Sequence[int]): Order of transpose.
    """

    def __init__(self, keys, order):
        self.keys = keys
        self.order = order

    def __call__(self, results):
        """Transpose the selected entries of ``results`` in place.

        Each ``results[key]`` is replaced by the value of its own
        ``transpose(self.order)`` call.

        Args:
            results (dict): Result dict contains the data to transpose.

        Returns:
            dict: The same dict, with the entries named in ``keys``
                transposed.
        """

        axes = self.order
        for name in self.keys:
            value = results[name]
            results[name] = value.transpose(axes)
        return results

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f'{cls_name}(keys={self.keys}, order={self.order})'
@PIPELINES.register_module()
class DefaultFormatBundle:
    """Default formatting bundle.

    Formats the common fields "img" and "gt_semantic_seg" when they are
    present in the result dict:

    - img: (1) add a trailing axis if it has fewer than 3 dimensions,
      (2) transpose with axes (2, 0, 1) into a contiguous array,
      (3) to tensor, (4) to DataContainer (stack=True)
    - gt_semantic_seg: (1) add a leading axis and cast to int64,
      (2) to tensor, (3) to DataContainer (stack=True)
    """

    def __call__(self, results):
        """Call function to transform and format common fields in results.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            dict: The same dict, with the fields above reformatted.
        """

        if 'img' in results:
            image = results['img']
            if len(image.shape) < 3:
                image = np.expand_dims(image, -1)
            channels_first = np.ascontiguousarray(image.transpose(2, 0, 1))
            results['img'] = DC(to_tensor(channels_first), stack=True)

        if 'gt_semantic_seg' in results:
            # convert to long, with a new leading axis
            seg = results['gt_semantic_seg'][None, ...].astype(np.int64)
            results['gt_semantic_seg'] = DC(to_tensor(seg), stack=True)

        return results

    def __repr__(self):
        return self.__class__.__name__
+ + - "scale_factor": a float indicating the preprocessing scale + + - "flip": a boolean indicating if image flip transform was used + + - "filename": path to the image file + + - "ori_shape": original shape of the image as a tuple (h, w, c) + + - "pad_shape": image shape after padding + + - "img_norm_cfg": a dict of normalization information: + - mean - per channel mean subtraction + - std - per channel std divisor + - to_rgb - bool indicating if bgr was converted to rgb + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')`` + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:mmcv.DataContainer. + + Args: + results (dict): Result dict contains the data to collect. 
@PIPELINES.register_module()
class LoadImageFromFile:
    """Load an image from file.

    Required keys are "img_prefix" and "img_info" (a dict that must contain the
    key "filename"). Added or updated keys are "filename", "ori_filename",
    "img", "img_shape", "ori_shape" (same as `img_shape`), "pad_shape" (same
    as `img_shape`), "scale_factor" (1.0) and "img_norm_cfg" (means=0 and
    stds=1).

    Args:
        to_float32 (bool): Whether to convert the loaded image to a float32
            numpy array. If set to False, the loaded image is an uint8 array.
            Defaults to False.
        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
            Defaults to 'color'.
        file_client_args (dict): Arguments to instantiate a FileClient.
            See :class:`mmcv.fileio.FileClient` for details.
            Defaults to ``dict(backend='disk')``.
        imdecode_backend (str): Backend for :func:`mmcv.imdecode`. Default:
            'cv2'
    """

    def __init__(self,
                 to_float32=False,
                 color_type='color',
                 file_client_args=dict(backend='disk'),
                 imdecode_backend='cv2'):
        self.to_float32 = to_float32
        self.color_type = color_type
        # Keep a private copy so the shared default dict is never mutated.
        self.file_client_args = file_client_args.copy()
        # The client is created on first use in __call__.
        self.file_client = None
        self.imdecode_backend = imdecode_backend

    def __call__(self, results):
        """Call functions to load image and get image meta information.

        Args:
            results (dict): Result dict from :obj:`mmseg.CustomDataset`.

        Returns:
            dict: The dict contains loaded image and meta information.
        """

        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)

        prefix = results.get('img_prefix')
        relative_name = results['img_info']['filename']
        if prefix is None:
            filename = relative_name
        else:
            filename = osp.join(prefix, relative_name)

        raw = self.file_client.get(filename)
        img = mmcv.imfrombytes(
            raw, flag=self.color_type, backend=self.imdecode_backend)
        if self.to_float32:
            img = img.astype(np.float32)

        shape = img.shape
        results['filename'] = filename
        results['ori_filename'] = relative_name
        results['img'] = img
        results['img_shape'] = shape
        results['ori_shape'] = shape
        # Set initial values for default meta_keys
        results['pad_shape'] = shape
        results['scale_factor'] = 1.0
        # An image without a channel axis counts as one channel.
        num_channels = shape[2] if len(shape) >= 3 else 1
        results['img_norm_cfg'] = dict(
            mean=np.zeros(num_channels, dtype=np.float32),
            std=np.ones(num_channels, dtype=np.float32),
            to_rgb=False)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(to_float32={self.to_float32},'
                f"color_type='{self.color_type}',"
                f"imdecode_backend='{self.imdecode_backend}')")
@PIPELINES.register_module()
class LoadAnnotations(object):
    """Load annotations for semantic segmentation.

    Reads ``results['ann_info']['seg_map']`` (joined onto
    ``results['seg_prefix']`` when that prefix is not None), stores the
    decoded map under ``results['gt_semantic_seg']`` and appends that key to
    ``results['seg_fields']``.

    Args:
        reduce_zero_label (bool): Whether to reduce all label values by 1,
            mapping the original label 0 to 255. Default: False.
        file_client_args (dict): Keyword arguments used to build the
            ``mmcv.FileClient``. Defaults to ``dict(backend='disk')``.
        imdecode_backend (str): Passed as ``backend`` to
            ``mmcv.imfrombytes``. Default: 'pillow'
    """

    def __init__(self,
                 reduce_zero_label=False,
                 file_client_args=dict(backend='disk'),
                 imdecode_backend='pillow'):
        self.reduce_zero_label = reduce_zero_label
        # Copy so the shared default dict is never aliased.
        self.file_client_args = file_client_args.copy()
        # Built lazily on the first __call__.
        self.file_client = None
        self.imdecode_backend = imdecode_backend

    def __call__(self, results):
        """Load the segmentation map referenced by ``results``.

        Args:
            results (dict): Must hold ``ann_info['seg_map']`` and a
                ``seg_fields`` list; may hold ``seg_prefix`` and
                ``label_map``.

        Returns:
            dict: The same ``results`` dict with ``gt_semantic_seg`` added.
        """

        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)

        if results.get('seg_prefix', None) is not None:
            filename = osp.join(results['seg_prefix'],
                                results['ann_info']['seg_map'])
        else:
            filename = results['ann_info']['seg_map']
        img_bytes = self.file_client.get(filename)
        gt_semantic_seg = mmcv.imfrombytes(
            img_bytes, flag='unchanged',
            backend=self.imdecode_backend).squeeze().astype(np.uint8)
        # modify if custom classes
        if results.get('label_map', None) is not None:
            # Build the masks from an untouched snapshot. Remapping in place
            # would let a pixel already rewritten to ``new_id`` be rewritten
            # again when ``new_id`` is also a later ``old_id``.
            original_seg = gt_semantic_seg.copy()
            for old_id, new_id in results['label_map'].items():
                gt_semantic_seg[original_seg == old_id] = new_id
        # reduce zero_label
        if self.reduce_zero_label:
            # Move 0 to 255 first so the uint8 subtraction never wraps
            # around; the shifted 255 (now 254) is then restored to 255.
            gt_semantic_seg[gt_semantic_seg == 0] = 255
            gt_semantic_seg = gt_semantic_seg - 1
            gt_semantic_seg[gt_semantic_seg == 254] = 255
        results['gt_semantic_seg'] = gt_semantic_seg
        results['seg_fields'].append('gt_semantic_seg')
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(reduce_zero_label={self.reduce_zero_label},'
        repr_str += f"imdecode_backend='{self.imdecode_backend}')"
        return repr_str
@PIPELINES.register_module()
class MultiScaleFlipAug(object):
    """Test-time augmentation with multiple scales and flipping.

    An example configuration is as followed:

    .. code-block::

        img_scale=(2048, 1024),
        img_ratios=[0.5, 1.0],
        flip=True,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ]

    After MultiScaleFlipAug with the above configuration, the results are
    wrapped into lists of the same length as followed:

    .. code-block::

        dict(
            img=[...],
            img_shape=[...],
            scale=[(1024, 512), (1024, 512), (2048, 1024), (2048, 1024)]
            flip=[False, True, False, True]
            ...
        )

    Args:
        transforms (list[dict]): Transforms to apply in each augmentation.
        img_scale (None | tuple | list[tuple]): Images scales for resizing.
        img_ratios (float | list[float]): Image ratios for resizing
        flip (bool): Whether apply flip augmentation. Default: False.
        flip_direction (str | list[str]): Flip augmentation directions,
            options are "horizontal" and "vertical". If flip_direction is
            list, multiple flip augmentations will be applied.
            Default: "horizontal".
    """

    def __init__(self,
                 transforms,
                 img_scale,
                 img_ratios=None,
                 flip=False,
                 flip_direction='horizontal'):
        self.transforms = Compose(transforms)
        if img_ratios is not None:
            img_ratios = img_ratios if isinstance(img_ratios,
                                                  list) else [img_ratios]
            assert mmcv.is_list_of(img_ratios, float)
        if img_scale is None:
            # mode 1: given img_scale=None and a range of image ratio
            self.img_scale = None
            assert mmcv.is_list_of(img_ratios, float)
        elif isinstance(img_scale, tuple) and mmcv.is_list_of(
                img_ratios, float):
            assert len(img_scale) == 2
            # mode 2: given a scale and a range of image ratio
            self.img_scale = [(int(img_scale[0] * ratio),
                               int(img_scale[1] * ratio))
                              for ratio in img_ratios]
        else:
            # mode 3: given multiple scales
            self.img_scale = img_scale if isinstance(img_scale,
                                                     list) else [img_scale]
        assert mmcv.is_list_of(self.img_scale, tuple) or self.img_scale is None
        self.flip = flip
        self.img_ratios = img_ratios
        self.flip_direction = flip_direction if isinstance(
            flip_direction, list) else [flip_direction]
        assert mmcv.is_list_of(self.flip_direction, str)
        if not self.flip and self.flip_direction != ['horizontal']:
            warnings.warn(
                'flip_direction has no effect when flip is set to False')
        if (self.flip
                and not any(t['type'] == 'RandomFlip' for t in transforms)):
            warnings.warn(
                'flip has no effect when RandomFlip is not in transforms')

    def __call__(self, results):
        """Call function to apply test time augment transforms on results.

        Args:
            results (dict): Result dict contains the data to transform.

        Returns:
            dict[str: list]: The augmented data, where each value is wrapped
                into a list.
        """

        aug_data = []
        if self.img_scale is None and mmcv.is_list_of(self.img_ratios, float):
            # mode 1: derive the scales from the input image size, as (w, h).
            h, w = results['img'].shape[:2]
            img_scale = [(int(w * ratio), int(h * ratio))
                         for ratio in self.img_ratios]
        else:
            img_scale = self.img_scale
        flip_aug = [False, True] if self.flip else [False]
        # NOTE: every direction is iterated for the non-flipped case too, so
        # more than one flip_direction yields one entry per direction with
        # flip=False.
        for scale in img_scale:
            for flip in flip_aug:
                for direction in self.flip_direction:
                    # Shallow copy: each augmentation gets its own top-level
                    # keys while sharing the underlying input objects.
                    _results = results.copy()
                    _results['scale'] = scale
                    _results['flip'] = flip
                    _results['flip_direction'] = direction
                    data = self.transforms(_results)
                    aug_data.append(data)
        # list of dict to dict of list
        aug_data_dict = {key: [] for key in aug_data[0]}
        for data in aug_data:
            for key, val in data.items():
                aug_data_dict[key].append(val)
        return aug_data_dict

    def __repr__(self):
        # The closing parenthesis belongs after the last field.
        repr_str = self.__class__.__name__
        repr_str += f'(transforms={self.transforms}, '
        repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
        repr_str += f'flip_direction={self.flip_direction})'
        return repr_str
@PIPELINES.register_module()
class Resize(object):
    """Resize the image and every map listed in ``seg_fields``.

    When the incoming dict already carries a "scale" key it is used as is;
    otherwise a scale is chosen as follows:

    - ``ratio_range is not None``: a ratio is drawn uniformly from
      ``ratio_range`` and multiplied with either the (width, height) of the
      input image (``img_scale is None``) or the single configured
      ``img_scale``.
    - ``ratio_range is None`` and exactly one ``img_scale``: that scale.
    - ``ratio_range is None`` and ``multiscale_mode == "range"``: long and
      short edges are drawn between the two given scales.
    - ``ratio_range is None`` and ``multiscale_mode == "value"``: one of the
      given scales is picked at random.

    Args:
        img_scale (tuple or list[tuple]): Images scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True):
        if img_scale is not None:
            scales = img_scale if isinstance(img_scale, list) else [img_scale]
            self.img_scale = scales
            assert mmcv.is_list_of(self.img_scale, tuple)
        else:
            self.img_scale = None

        if ratio_range is None:
            # Multiple scales or a range of scales.
            assert multiscale_mode in ['value', 'range']
        else:
            # A ratio range combines with at most one base scale.
            assert self.img_scale is None or len(self.img_scale) == 1

        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio

    @staticmethod
    def random_select(img_scales):
        """Pick one scale uniformly at random from ``img_scales``.

        Args:
            img_scales (list[tuple]): Candidate image scales.

        Returns:
            (tuple, int): ``(img_scale, scale_idx)``, the chosen scale and
                its index in ``img_scales``.
        """
        assert mmcv.is_list_of(img_scales, tuple)
        chosen = np.random.randint(len(img_scales))
        return img_scales[chosen], chosen

    @staticmethod
    def random_sample(img_scales):
        """Draw a scale between two bounding scales.

        Args:
            img_scales (list[tuple]): Exactly two tuples giving the lower
                and upper bound of image scales.

        Returns:
            (tuple, None): ``(img_scale, None)`` where ``img_scale`` is
                ``(long_edge, short_edge)``; the None mirrors the index
                slot returned by :func:`random_select`.
        """
        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        long_edges = [max(pair) for pair in img_scales]
        short_edges = [min(pair) for pair in img_scales]
        # The long edge is drawn first, then the short edge; the upper
        # bounds are made inclusive with +1.
        sampled_long = np.random.randint(
            min(long_edges), max(long_edges) + 1)
        sampled_short = np.random.randint(
            min(short_edges), max(short_edges) + 1)
        return (sampled_long, sampled_short), None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        """Scale ``img_scale`` by a ratio drawn from ``ratio_range``.

        Args:
            img_scale (tuple): Base scale, two elements.
            ratio_range (tuple[float]): ``(min_ratio, max_ratio)``.

        Returns:
            (tuple, None): ``(scale, None)`` where each element of ``scale``
                is the matching element of ``img_scale`` times the ratio,
                truncated to int.
        """
        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        lo, hi = ratio_range
        assert lo <= hi
        ratio = np.random.random_sample() * (hi - lo) + lo
        scaled = (int(img_scale[0] * ratio), int(img_scale[1] * ratio))
        return scaled, None

    def _random_scale(self, results):
        """Choose a scale and store it in ``results``.

        Writes ``results['scale']`` and ``results['scale_idx']``.

        Args:
            results (dict): Result dict; ``results['img']`` is read when
                ``ratio_range`` is set and ``img_scale`` is None.
        """
        if self.ratio_range is not None:
            if self.img_scale is not None:
                base = self.img_scale[0]
            else:
                height, width = results['img'].shape[:2]
                base = (width, height)
            picked, picked_idx = self.random_sample_ratio(
                base, self.ratio_range)
        elif len(self.img_scale) == 1:
            picked, picked_idx = self.img_scale[0], 0
        elif self.multiscale_mode == 'range':
            picked, picked_idx = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            picked, picked_idx = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'] = picked
        results['scale_idx'] = picked_idx

    def _resize_img(self, results):
        """Resize ``results['img']`` to ``results['scale']``.

        Updates "img", "img_shape", "pad_shape", "scale_factor" and
        "keep_ratio".
        """
        source = results['img']
        target = results['scale']
        if not self.keep_ratio:
            resized, w_scale, h_scale = mmcv.imresize(
                source, target, return_scale=True)
        else:
            # The single factor returned by imrescale is not used: the
            # per-axis factors are recomputed from the actual output shape.
            resized, _ = mmcv.imrescale(source, target, return_scale=True)
            out_h, out_w = resized.shape[:2]
            in_h, in_w = source.shape[:2]
            w_scale = out_w / in_w
            h_scale = out_h / in_h
        factors = np.array([w_scale, h_scale, w_scale, h_scale],
                           dtype=np.float32)
        results['img'] = resized
        results['img_shape'] = resized.shape
        # pad_shape equals the image shape until a padding transform runs.
        results['pad_shape'] = resized.shape
        results['scale_factor'] = factors
        results['keep_ratio'] = self.keep_ratio

    def _resize_seg(self, results):
        """Resize every map in ``seg_fields`` with nearest interpolation."""
        resize_fn = mmcv.imrescale if self.keep_ratio else mmcv.imresize
        for field in results.get('seg_fields', []):
            results[field] = resize_fn(
                results[field], results['scale'], interpolation='nearest')

    def __call__(self, results):
        """Resize the image and segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor',
                'keep_ratio' keys are added into result dict.
        """
        if 'scale' not in results:
            self._random_scale(results)
        self._resize_img(results)
        self._resize_seg(results)
        return results

    def __repr__(self):
        fields = (f'(img_scale={self.img_scale}, '
                  f'multiscale_mode={self.multiscale_mode}, '
                  f'ratio_range={self.ratio_range}, '
                  f'keep_ratio={self.keep_ratio})')
        return self.__class__.__name__ + fields
@PIPELINES.register_module()
class RandomFlip(object):
    """Flip the image & seg.

    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.

    Args:
        prob (float, optional): The flipping probability. When None, the
            image is never flipped unless "flip" is already set in the
            input dict. Default: None.
        direction(str, optional): The flipping direction. Options are
            'horizontal' and 'vertical'. Default: 'horizontal'.
    """

    @deprecated_api_warning({'flip_ratio': 'prob'}, cls_name='RandomFlip')
    def __init__(self, prob=None, direction='horizontal'):
        self.prob = prob
        self.direction = direction
        if prob is not None:
            assert prob >= 0 and prob <= 1
        assert direction in ['horizontal', 'vertical']

    def __call__(self, results):
        """Call function to flip the image and semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Flipped results, 'flip', 'flip_direction' keys are added
                into result dict.
        """

        if 'flip' not in results:
            # ``prob`` defaults to None; comparing a float with None raises
            # TypeError, so None is treated as "do not flip".
            flip = (self.prob is not None
                    and bool(np.random.rand() < self.prob))
            results['flip'] = flip
        if 'flip_direction' not in results:
            results['flip_direction'] = self.direction
        if results['flip']:
            # flip image
            results['img'] = mmcv.imflip(
                results['img'], direction=results['flip_direction'])

            # flip segs
            for key in results.get('seg_fields', []):
                # use copy() to make numpy stride positive
                results[key] = mmcv.imflip(
                    results[key], direction=results['flip_direction']).copy()
        return results

    def __repr__(self):
        return self.__class__.__name__ + f'(prob={self.prob})'
@PIPELINES.register_module()
class Pad(object):
    """Pad the image and every map listed in ``seg_fields``.

    Exactly one of ``size`` (pad to a fixed shape) and ``size_divisor``
    (pad up to a multiple of a number) must be given. Keys written are
    "pad_shape", "pad_fixed_size" and "pad_size_divisor".

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value. Default: 0.
        seg_pad_val (float, optional): Padding value of segmentation map.
            Default: 255.
    """

    def __init__(self,
                 size=None,
                 size_divisor=None,
                 pad_val=0,
                 seg_pad_val=255):
        self.size, self.size_divisor = size, size_divisor
        self.pad_val, self.seg_pad_val = pad_val, seg_pad_val
        # Exactly one of the two modes may be selected.
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        """Pad ``results['img']`` and record the padding meta keys."""
        image = results['img']
        if self.size is not None:
            padded_img = mmcv.impad(
                image, shape=self.size, pad_val=self.pad_val)
        elif self.size_divisor is not None:
            padded_img = mmcv.impad_to_multiple(
                image, self.size_divisor, pad_val=self.pad_val)
        results['img'] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_seg(self, results):
        """Pad each segmentation map to the first two dims of pad_shape."""
        target_hw = results['pad_shape'][:2]
        for field in results.get('seg_fields', []):
            results[field] = mmcv.impad(
                results[field], shape=target_hw, pad_val=self.seg_pad_val)

    def __call__(self, results):
        """Pad the image first, then the segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        self._pad_img(results)
        self._pad_seg(results)
        return results

    def __repr__(self):
        details = (f'(size={self.size}, size_divisor={self.size_divisor}, '
                   f'pad_val={self.pad_val})')
        return self.__class__.__name__ + details
@PIPELINES.register_module()
class Normalize(object):
    """Normalize the image with ``mmcv.imnormalize``.

    Added key is "img_norm_cfg".

    Args:
        mean (sequence): Mean values, stored as a float32 array.
        std (sequence): Std values, stored as a float32 array.
        to_rgb (bool): Forwarded to ``mmcv.imnormalize``; default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        self.to_rgb = to_rgb
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)

    def __call__(self, results):
        """Normalize ``results['img']`` and record the parameters used.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Normalized results, 'img_norm_cfg' key is added into
                result dict.
        """
        normalized = mmcv.imnormalize(results['img'], self.mean, self.std,
                                      self.to_rgb)
        results['img'] = normalized
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        cls_name = self.__class__.__name__
        return (cls_name + f'(mean={self.mean}, std={self.std}, to_rgb='
                f'{self.to_rgb})')
@PIPELINES.register_module()
class Rerange(object):
    """Linearly map the image pixel values onto ``[min_value, max_value]``.

    Args:
        min_value (float or int): Minimum value of the reranged image.
            Default: 0.
        max_value (float or int): Maximum value of the reranged image.
            Default: 255.
    """

    def __init__(self, min_value=0, max_value=255):
        assert isinstance(min_value, (float, int))
        assert isinstance(max_value, (float, int))
        assert min_value < max_value
        self.min_value, self.max_value = min_value, max_value

    def __call__(self, results):
        """Rerange ``results['img']``.

        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Reranged results.
        """
        source = results['img']
        lowest = np.min(source)
        highest = np.max(source)

        # A constant image has no value range to stretch.
        assert lowest < highest
        # First to [0, 1], then to [min_value, max_value].
        unit = (source - lowest) / (highest - lowest)
        span = self.max_value - self.min_value
        results['img'] = unit * span + self.min_value

        return results

    def __repr__(self):
        cls_name = self.__class__.__name__
        return cls_name + \
            f'(min_value={self.min_value}, max_value={self.max_value})'


@PIPELINES.register_module()
class CLAHE(object):
    """Use CLAHE method to process the image.

    See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
    Graphics Gems, 1994:474-485.` for more information.

    Args:
        clip_limit (float): Threshold for contrast limiting. Default: 40.0.
        tile_grid_size (tuple[int]): Size of grid for histogram equalization.
            Must hold two ints. Default: (8, 8).
    """

    def __init__(self, clip_limit=40.0, tile_grid_size=(8, 8)):
        assert isinstance(clip_limit, (float, int))
        assert is_tuple_of(tile_grid_size, int)
        assert len(tile_grid_size) == 2
        self.clip_limit = clip_limit
        self.tile_grid_size = tile_grid_size

    def __call__(self, results):
        """Apply ``mmcv.clahe`` to each channel of ``results['img']``.

        The image is modified in place, one channel at a time.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Processed results.
        """
        channel_count = results['img'].shape[2]
        for channel in range(channel_count):
            # Each channel is converted to uint8 before equalization.
            plane = np.array(results['img'][:, :, channel], dtype=np.uint8)
            results['img'][:, :, channel] = mmcv.clahe(
                plane, self.clip_limit, self.tile_grid_size)

        return results

    def __repr__(self):
        cls_name = self.__class__.__name__
        return cls_name + (f'(clip_limit={self.clip_limit}, '
                           f'tile_grid_size={self.tile_grid_size})')
@PIPELINES.register_module()
class RandomCrop(object):
    """Crop a random window from the image and its segmentation maps.

    Args:
        crop_size (tuple): Expected size after cropping, (h, w).
        cat_max_ratio (float): The maximum ratio that single category could
            occupy.
        ignore_index (int): Label value left out of the category counts
            used for the ``cat_max_ratio`` check. Default: 255.
    """

    def __init__(self, crop_size, cat_max_ratio=1., ignore_index=255):
        assert crop_size[0] > 0 and crop_size[1] > 0
        self.crop_size = crop_size
        self.cat_max_ratio = cat_max_ratio
        self.ignore_index = ignore_index

    def get_crop_bbox(self, img):
        """Draw a random crop window ``(y1, y2, x1, x2)`` for ``img``."""
        crop_h, crop_w = self.crop_size[0], self.crop_size[1]
        # No slack is available along an axis shorter than the crop.
        slack_h = max(img.shape[0] - crop_h, 0)
        slack_w = max(img.shape[1] - crop_w, 0)
        top = np.random.randint(0, slack_h + 1)
        left = np.random.randint(0, slack_w + 1)
        return top, top + crop_h, left, left + crop_w

    def crop(self, img, crop_bbox):
        """Slice ``img`` to the window ``(y1, y2, x1, x2)``."""
        y1, y2, x1, x2 = crop_bbox
        return img[y1:y2, x1:x2, ...]

    def __call__(self, results):
        """Randomly crop the image and semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Randomly cropped results, 'img_shape' key in result dict is
                updated according to crop size.
        """
        image = results['img']
        window = self.get_crop_bbox(image)
        if self.cat_max_ratio < 1.:
            # Up to 10 attempts to find a window where no single category
            # (ignore_index excluded) reaches cat_max_ratio; the window
            # drawn after the last failed attempt is used as is.
            for _ in range(10):
                seg_patch = self.crop(results['gt_semantic_seg'], window)
                labels, counts = np.unique(seg_patch, return_counts=True)
                counts = counts[labels != self.ignore_index]
                if len(counts) > 1 and np.max(counts) / np.sum(
                        counts) < self.cat_max_ratio:
                    break
                window = self.get_crop_bbox(image)

        cropped = self.crop(image, window)
        results['img'] = cropped
        results['img_shape'] = cropped.shape

        # The same window is applied to every segmentation map.
        for field in results.get('seg_fields', []):
            results[field] = self.crop(results[field], window)

        return results

    def __repr__(self):
        return self.__class__.__name__ + f'(crop_size={self.crop_size})'
@PIPELINES.register_module()
class RandomRotate(object):
    """Rotate the image & seg.

    Args:
        prob (float): The rotation probability.
        degree (float, tuple[float]): Range of degrees to select from. If
            degree is a number instead of tuple like (min, max),
            the range of degree will be (``-degree``, ``+degree``)
        pad_val (float, optional): Padding value of image. Default: 0.
        seg_pad_val (float, optional): Padding value of segmentation map.
            Default: 255.
        center (tuple[float], optional): Center point (w, h) of the rotation in
            the source image. If not specified, the center of the image will be
            used. Default: None.
        auto_bound (bool): Whether to adjust the image size to cover the whole
            rotated image. Default: False
    """

    def __init__(self,
                 prob,
                 degree,
                 pad_val=0,
                 seg_pad_val=255,
                 center=None,
                 auto_bound=False):
        self.prob = prob
        assert prob >= 0 and prob <= 1
        if isinstance(degree, (float, int)):
            assert degree > 0, f'degree {degree} should be positive'
            self.degree = (-degree, degree)
        else:
            self.degree = degree
        assert len(self.degree) == 2, f'degree {self.degree} should be a ' \
                                      f'tuple of (min, max)'
        # Stored under the name of the constructor argument; the misspelled
        # ``pal_val`` stays available through the property below.
        self.pad_val = pad_val
        self.seg_pad_val = seg_pad_val
        self.center = center
        self.auto_bound = auto_bound

    @property
    def pal_val(self):
        """Backward-compatible alias of ``pad_val``."""
        return self.pad_val

    @pal_val.setter
    def pal_val(self, value):
        self.pad_val = value

    def __call__(self, results):
        """Call function to rotate image, semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Rotated results.
        """

        rotate = True if np.random.rand() < self.prob else False
        # The angle is drawn even when no rotation happens, so the number
        # of random draws per call does not depend on the outcome.
        degree = np.random.uniform(min(*self.degree), max(*self.degree))
        if rotate:
            # rotate image
            results['img'] = mmcv.imrotate(
                results['img'],
                angle=degree,
                border_value=self.pad_val,
                center=self.center,
                auto_bound=self.auto_bound)

            # rotate segs
            for key in results.get('seg_fields', []):
                results[key] = mmcv.imrotate(
                    results[key],
                    angle=degree,
                    border_value=self.seg_pad_val,
                    center=self.center,
                    auto_bound=self.auto_bound,
                    interpolation='nearest')
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(prob={self.prob}, ' \
                    f'degree={self.degree}, ' \
                    f'pad_val={self.pad_val}, ' \
                    f'seg_pad_val={self.seg_pad_val}, ' \
                    f'center={self.center}, ' \
                    f'auto_bound={self.auto_bound})'
        return repr_str
@PIPELINES.register_module()
class RGB2Gray(object):
    """Collapse the image channels into a weighted gray value.

    The channels are combined with ``weights`` into one channel, which is
    then repeated ``out_channels`` times. When ``out_channels`` is None the
    result has as many channels as there are weights.

    Args:
        out_channels (int): Expected number of output channels after
            transforming. Default: None.
        weights (tuple[float]): The weights to calculate the weighted mean.
            Default: (0.299, 0.587, 0.114).
    """

    def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)):
        assert out_channels is None or out_channels > 0
        assert isinstance(weights, tuple)
        for weight in weights:
            assert isinstance(weight, (float, int))
        self.out_channels = out_channels
        self.weights = weights

    def __call__(self, results):
        """Convert ``results['img']`` to its weighted gray version.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with grayscale image.
        """
        image = results['img']
        assert len(image.shape) == 3
        assert image.shape[2] == len(self.weights)
        # Shape (1, 1, C) so the weights broadcast over the channel axis.
        channel_weights = np.array(self.weights).reshape((1, 1, -1))
        gray = (image * channel_weights).sum(2, keepdims=True)
        if self.out_channels is not None:
            repeats = self.out_channels
        else:
            repeats = channel_weights.shape[2]
        gray = gray.repeat(repeats, axis=2)

        results['img'] = gray
        results['img_shape'] = gray.shape

        return results

    def __repr__(self):
        cls_name = self.__class__.__name__
        return cls_name + (f'(out_channels={self.out_channels}, '
                           f'weights={self.weights})')
@PIPELINES.register_module()
class AdjustGamma(object):
    """Apply gamma correction to the image through a lookup table.

    Args:
        gamma (float or int): Gamma value used in gamma correction.
            Default: 1.0.
    """

    def __init__(self, gamma=1.0):
        assert isinstance(gamma, (float, int))
        assert gamma > 0
        self.gamma = gamma
        exponent = 1.0 / gamma
        # 256-entry uint8 table, one output value per input level.
        levels = [(level / 255.0)**exponent * 255
                  for level in np.arange(256)]
        self.table = np.array(levels).astype('uint8')

    def __call__(self, results):
        """Map ``results['img']`` through the gamma table.

        The image is converted to uint8 before the lookup.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Processed results.
        """
        as_uint8 = np.array(results['img'], dtype=np.uint8)
        results['img'] = mmcv.lut_transform(as_uint8, self.table)

        return results

    def __repr__(self):
        return self.__class__.__name__ + f'(gamma={self.gamma})'


@PIPELINES.register_module()
class SegRescale(object):
    """Rescale semantic segmentation maps.

    Args:
        scale_factor (float): The scale factor of the final output.
    """

    def __init__(self, scale_factor=1):
        self.scale_factor = scale_factor

    def __call__(self, results):
        """Rescale every map listed in ``seg_fields``.

        Nothing is changed when ``scale_factor`` equals 1.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with semantic segmentation map scaled.
        """
        if self.scale_factor == 1:
            return results
        for field in results.get('seg_fields', []):
            results[field] = mmcv.imrescale(
                results[field], self.scale_factor, interpolation='nearest')
        return results

    def __repr__(self):
        return self.__class__.__name__ + f'(scale_factor={self.scale_factor})'
+ """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def convert(self, img, alpha=1, beta=0): + """Multiple with alpha and add beat with clip.""" + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + """Brightness distortion.""" + if random.randint(2): + return self.convert( + img, + beta=random.uniform(-self.brightness_delta, + self.brightness_delta)) + return img + + def contrast(self, img): + """Contrast distortion.""" + if random.randint(2): + return self.convert( + img, + alpha=random.uniform(self.contrast_lower, self.contrast_upper)) + return img + + def saturation(self, img): + """Saturation distortion.""" + if random.randint(2): + img = mmcv.bgr2hsv(img) + img[:, :, 1] = self.convert( + img[:, :, 1], + alpha=random.uniform(self.saturation_lower, + self.saturation_upper)) + img = mmcv.hsv2bgr(img) + return img + + def hue(self, img): + """Hue distortion.""" + if random.randint(2): + img = mmcv.bgr2hsv(img) + img[:, :, + 0] = (img[:, :, 0].astype(int) + + random.randint(-self.hue_delta, self.hue_delta)) % 180 + img = mmcv.hsv2bgr(img) + return img + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. 
+ """ + + img = results['img'] + # random brightness + img = self.brightness(img) + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + img = self.contrast(img) + + # random saturation + img = self.saturation(img) + + # random hue + img = self.hue(img) + + # random contrast + if mode == 0: + img = self.contrast(img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(brightness_delta={self.brightness_delta}, ' + f'contrast_range=({self.contrast_lower}, ' + f'{self.contrast_upper}), ' + f'saturation_range=({self.saturation_lower}, ' + f'{self.saturation_upper}), ' + f'hue_delta={self.hue_delta})') + return repr_str diff --git a/lavis/common/annotator/uniformer/mmseg/datasets/stare.py b/lavis/common/annotator/uniformer/mmseg/datasets/stare.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd14e0920e7f6a73baff1432e5a32ccfdb0dfae --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/datasets/stare.py @@ -0,0 +1,27 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class STAREDataset(CustomDataset): + """STARE dataset. + + In segmentation map annotation for STARE, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.ah.png'. 
@DATASETS.register_module()
class PascalVOCDataset(CustomDataset):
    """Pascal VOC dataset.

    Images are looked up with the ``.jpg`` suffix and segmentation maps with
    the ``.png`` suffix; both are fixed by this class.

    Args:
        split (str): Split txt file for Pascal VOC.
    """

    CLASSES = (
        'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
        'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
        'tvmonitor',
    )

    # One colour triple per entry of CLASSES, in the same order.
    PALETTE = [
        [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
        [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
        [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
        [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
        [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
        [0, 64, 128],
    ]

    def __init__(self, split, **kwargs):
        super().__init__(
            img_suffix='.jpg',
            seg_map_suffix='.png',
            split=split,
            **kwargs)
        # The image directory must exist and a split must have been set.
        img_dir_exists = osp.exists(self.img_dir)
        assert img_dir_exists and self.split is not None
F401,F403 +from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, + build_head, build_loss, build_segmentor) +from .decode_heads import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .segmentors import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', + 'build_head', 'build_loss', 'build_segmentor' +] diff --git a/lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py b/lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8339983905fb5d20bae42ba6f76fea75d278b1aa --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py @@ -0,0 +1,17 @@ +from .cgnet import CGNet +# from .fast_scnn import FastSCNN +from .hrnet import HRNet +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1c, ResNetV1d +from .resnext import ResNeXt +from .unet import UNet +from .vit import VisionTransformer +from .uniformer import UniFormer + +__all__ = [ + 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', + 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', + 'VisionTransformer', 'UniFormer' +] diff --git a/lavis/common/annotator/uniformer/mmseg/models/backbones/cgnet.py b/lavis/common/annotator/uniformer/mmseg/models/backbones/cgnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f8bca442c8f18179f217e40c298fb5ef39df77c4 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/backbones/cgnet.py @@ -0,0 +1,367 @@ +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from annotator.uniformer.mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, + constant_init, kaiming_init) +from annotator.uniformer.mmcv.runner import load_checkpoint +from 
class GlobalContextExtractor(nn.Module):
    """Global Context Extractor for CGNet.

    Re-weights each channel of the input with a gate computed from the
    globally average-pooled feature (pool -> FC -> ReLU -> FC -> sigmoid).

    Args:
        channel (int): Number of input feature channels.
        reduction (int): Reductions for global context extractor. Default: 16.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
    """

    def __init__(self, channel, reduction=16, with_cp=False):
        super().__init__()
        self.channel = channel
        self.reduction = reduction
        assert reduction >= 1 and channel >= reduction
        self.with_cp = with_cp

        hidden = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channel),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Scale ``x`` channel-wise by its learned global-context gate."""

        def _reweight(feat):
            batch, chans = feat.size()[:2]
            pooled = self.avg_pool(feat).view(batch, chans)
            gate = self.fc(pooled).view(batch, chans, 1, 1)
            return feat * gate

        # Checkpointing is only used when gradients flow through the input.
        if self.with_cp and x.requires_grad:
            return cp.checkpoint(_reweight, x)
        return _reweight(x)
+ downsample (bool): Downsample the input to 1/2 or not. Default: False. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + in_channels, + out_channels, + dilation=2, + reduction=16, + skip_connect=True, + downsample=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'), + with_cp=False): + super(ContextGuidedBlock, self).__init__() + self.with_cp = with_cp + self.downsample = downsample + + channels = out_channels if downsample else out_channels // 2 + if 'type' in act_cfg and act_cfg['type'] == 'PReLU': + act_cfg['num_parameters'] = channels + kernel_size = 3 if downsample else 1 + stride = 2 if downsample else 1 + padding = (kernel_size - 1) // 2 + + self.conv1x1 = ConvModule( + in_channels, + channels, + kernel_size, + stride, + padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.f_loc = build_conv_layer( + conv_cfg, + channels, + channels, + kernel_size=3, + padding=1, + groups=channels, + bias=False) + self.f_sur = build_conv_layer( + conv_cfg, + channels, + channels, + kernel_size=3, + padding=dilation, + groups=channels, + dilation=dilation, + bias=False) + + self.bn = build_norm_layer(norm_cfg, 2 * channels)[1] + self.activate = nn.PReLU(2 * channels) + + if downsample: + self.bottleneck = build_conv_layer( + conv_cfg, + 2 * channels, + out_channels, + kernel_size=1, + bias=False) + + self.skip_connect = skip_connect and not downsample + self.f_glo = GlobalContextExtractor(out_channels, reduction, with_cp) + + def forward(self, x): + + def _inner_forward(x): + out = self.conv1x1(x) + loc = 
@BACKBONES.register_module()
class CGNet(nn.Module):
    """CGNet backbone.

    A Light-weight Context Guided Network for Semantic Segmentation
    arXiv: https://arxiv.org/abs/1811.08201

    Args:
        in_channels (int): Number of input image channels. Normally 3.
        num_channels (tuple[int]): Numbers of feature channels at each stages.
            Default: (32, 64, 128).
        num_blocks (tuple[int]): Numbers of CG blocks at stage 1 and stage 2.
            Default: (3, 21).
        dilations (tuple[int]): Dilation rate for surrounding context
            extractors at stage 1 and stage 2. Default: (2, 4).
        reductions (tuple[int]): Reductions for global context extractors at
            stage 1 and stage 2. Default: (8, 16).
        conv_cfg (dict): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN', requires_grad=True).
        act_cfg (dict): Config dict for activation layer. The dict passed in
            is copied and never modified. Default: dict(type='PReLU').
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
    """

    def __init__(self,
                 in_channels=3,
                 num_channels=(32, 64, 128),
                 num_blocks=(3, 21),
                 dilations=(2, 4),
                 reductions=(8, 16),
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='PReLU'),
                 norm_eval=False,
                 with_cp=False):

        super(CGNet, self).__init__()
        self.in_channels = in_channels
        self.num_channels = num_channels
        assert isinstance(self.num_channels, tuple) and len(
            self.num_channels) == 3
        self.num_blocks = num_blocks
        assert isinstance(self.num_blocks, tuple) and len(self.num_blocks) == 2
        self.dilations = dilations
        assert isinstance(self.dilations, tuple) and len(self.dilations) == 2
        self.reductions = reductions
        assert isinstance(self.reductions, tuple) and len(self.reductions) == 2
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        # Work on a private copy: `num_parameters` is written into this dict
        # below (and again by every ContextGuidedBlock it is handed to), which
        # must not leak into the caller's dict or the shared default argument.
        act_cfg = act_cfg.copy()
        self.act_cfg = act_cfg
        if 'type' in self.act_cfg and self.act_cfg['type'] == 'PReLU':
            self.act_cfg['num_parameters'] = num_channels[0]
        self.norm_eval = norm_eval
        self.with_cp = with_cp

        # stage 0: three 3x3 convs, only the first one strided
        cur_channels = in_channels
        self.stem = nn.ModuleList()
        for i in range(3):
            self.stem.append(
                ConvModule(
                    cur_channels,
                    num_channels[0],
                    3,
                    2 if i == 0 else 1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            cur_channels = num_channels[0]

        self.inject_2x = InputInjection(1)  # down-sample for Input, factor=2
        self.inject_4x = InputInjection(2)  # down-sample for Input, factor=4

        # stem output is concatenated with the 2x down-sampled input
        cur_channels += in_channels
        self.norm_prelu_0 = nn.Sequential(
            build_norm_layer(norm_cfg, cur_channels)[1],
            nn.PReLU(cur_channels))

        # stage 1
        self.level1 = nn.ModuleList()
        for i in range(num_blocks[0]):
            self.level1.append(
                ContextGuidedBlock(
                    cur_channels if i == 0 else num_channels[1],
                    num_channels[1],
                    dilations[0],
                    reductions[0],
                    downsample=(i == 0),
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    with_cp=with_cp))  # CG block

        # last block + first block + 4x down-sampled input
        cur_channels = 2 * num_channels[1] + in_channels
        self.norm_prelu_1 = nn.Sequential(
            build_norm_layer(norm_cfg, cur_channels)[1],
            nn.PReLU(cur_channels))

        # stage 2
        self.level2 = nn.ModuleList()
        for i in range(num_blocks[1]):
            self.level2.append(
                ContextGuidedBlock(
                    cur_channels if i == 0 else num_channels[2],
                    num_channels[2],
                    dilations[1],
                    reductions[1],
                    downsample=(i == 0),
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    with_cp=with_cp))  # CG block

        # first block + last block
        cur_channels = 2 * num_channels[2]
        self.norm_prelu_2 = nn.Sequential(
            build_norm_layer(norm_cfg, cur_channels)[1],
            nn.PReLU(cur_channels))

    def forward(self, x):
        """Run the three stages and collect one feature map per stage.

        Args:
            x (Tensor): Input image batch.

        Returns:
            list[Tensor]: Three feature maps, one per stage, with
                ``num_channels[0] + in_channels``,
                ``2 * num_channels[1] + in_channels`` and
                ``2 * num_channels[2]`` channels respectively.
        """
        output = []

        # stage 0
        inp_2x = self.inject_2x(x)
        inp_4x = self.inject_4x(x)
        for layer in self.stem:
            x = layer(x)
        x = self.norm_prelu_0(torch.cat([x, inp_2x], 1))
        output.append(x)

        # stage 1
        for i, layer in enumerate(self.level1):
            x = layer(x)
            if i == 0:
                # keep the output of the down-sampling block for the concat
                down1 = x
        x = self.norm_prelu_1(torch.cat([x, down1, inp_4x], 1))
        output.append(x)

        # stage 2
        for i, layer in enumerate(self.level2):
            x = layer(x)
            if i == 0:
                down2 = x
        x = self.norm_prelu_2(torch.cat([down2, x], 1))
        output.append(x)

        return output

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, (nn.Conv2d, nn.Linear)):
                    kaiming_init(m)
                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                    constant_init(m, 1)
                elif isinstance(m, nn.PReLU):
                    constant_init(m, 0)
        else:
            raise TypeError('pretrained must be a str or None')

    def train(self, mode=True):
        """Switch to training mode while keeping the normalization layers
        frozen when ``norm_eval`` is set."""
        super(CGNet, self).train(mode)
        if mode and self.norm_eval:
            for m in self.modules():
                # trick: eval have effect on BatchNorm only
                if isinstance(m, _BatchNorm):
                    m.eval()
Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + """ + + def __init__(self, + in_channels, + dw_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')): + super(LearningToDownsample, self).__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + dw_channels1 = dw_channels[0] + dw_channels2 = dw_channels[1] + + self.conv = ConvModule( + in_channels, + dw_channels1, + 3, + stride=2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.dsconv1 = DepthwiseSeparableConvModule( + dw_channels1, + dw_channels2, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg) + self.dsconv2 = DepthwiseSeparableConvModule( + dw_channels2, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg) + + def forward(self, x): + x = self.conv(x) + x = self.dsconv1(x) + x = self.dsconv2(x) + return x + + +class GlobalFeatureExtractor(nn.Module): + """Global feature extractor module. + + Args: + in_channels (int): Number of input channels of the GFE module. + Default: 64 + block_channels (tuple[int]): Tuple of ints. Each int specifies the + number of output channels of each Inverted Residual module. + Default: (64, 96, 128) + out_channels(int): Number of output channels of the GFE module. + Default: 128 + expand_ratio (int): Adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + Default: 6 + num_blocks (tuple[int]): Tuple of ints. Each int specifies the + number of times each Inverted Residual module is repeated. + The repeated Inverted Residual modules are called a 'group'. + Default: (3, 3, 3) + strides (tuple[int]): Tuple of ints. Each int specifies + the downsampling factor of each 'group'. + Default: (2, 2, 1) + pool_scales (tuple[int]): Tuple of ints. Each int specifies + the parameter required in 'global average pooling' within PPM. 
+ Default: (1, 2, 3, 6) + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + align_corners (bool): align_corners argument of F.interpolate. + Default: False + """ + + def __init__(self, + in_channels=64, + block_channels=(64, 96, 128), + out_channels=128, + expand_ratio=6, + num_blocks=(3, 3, 3), + strides=(2, 2, 1), + pool_scales=(1, 2, 3, 6), + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False): + super(GlobalFeatureExtractor, self).__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + assert len(block_channels) == len(num_blocks) == 3 + self.bottleneck1 = self._make_layer(in_channels, block_channels[0], + num_blocks[0], strides[0], + expand_ratio) + self.bottleneck2 = self._make_layer(block_channels[0], + block_channels[1], num_blocks[1], + strides[1], expand_ratio) + self.bottleneck3 = self._make_layer(block_channels[1], + block_channels[2], num_blocks[2], + strides[2], expand_ratio) + self.ppm = PPM( + pool_scales, + block_channels[2], + block_channels[2] // 4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=align_corners) + self.out = ConvModule( + block_channels[2] * 2, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _make_layer(self, + in_channels, + out_channels, + blocks, + stride=1, + expand_ratio=6): + layers = [ + InvertedResidual( + in_channels, + out_channels, + stride, + expand_ratio, + norm_cfg=self.norm_cfg) + ] + for i in range(1, blocks): + layers.append( + InvertedResidual( + out_channels, + out_channels, + 1, + expand_ratio, + norm_cfg=self.norm_cfg)) + return nn.Sequential(*layers) + + def forward(self, x): + x = self.bottleneck1(x) + x = self.bottleneck2(x) + x = self.bottleneck3(x) + x = 
@BACKBONES.register_module()
class FastSCNN(nn.Module):
    """Fast-SCNN Backbone.

    Args:
        in_channels (int): Number of input image channels. Default: 3.
        downsample_dw_channels (tuple[int]): Number of output channels after
            the first conv layer & the second conv layer in
            Learning-To-Downsample (LTD) module.
            Default: (32, 48).
        global_in_channels (int): Number of input channels of
            Global Feature Extractor(GFE).
            Equal to number of output channels of LTD.
            Default: 64.
        global_block_channels (tuple[int]): Tuple of integers that describe
            the output channels for each of the MobileNet-v2 bottleneck
            residual blocks in GFE.
            Default: (64, 96, 128).
        global_block_strides (tuple[int]): Tuple of integers
            that describe the strides (downsampling factors) for each of the
            MobileNet-v2 bottleneck residual blocks in GFE.
            Default: (2, 2, 1).
        global_out_channels (int): Number of output channels of GFE.
            Default: 128.
        higher_in_channels (int): Number of input channels of the higher
            resolution branch in FFM.
            Equal to global_in_channels.
            Default: 64.
        lower_in_channels (int): Number of input channels of the lower
            resolution branch in FFM.
            Equal to global_out_channels.
            Default: 128.
        fusion_out_channels (int): Number of output channels of FFM.
            Default: 128.
        out_indices (tuple): Tuple of indices of list
            [higher_res_features, lower_res_features, fusion_output].
            Often set to (0,1,2) to enable aux. heads.
            Default: (0, 1, 2).
        conv_cfg (dict | None): Config of conv layers. Default: None
        norm_cfg (dict | None): Config of norm layers. Default:
            dict(type='BN')
        act_cfg (dict): Config of activation layers. Default:
            dict(type='ReLU')
        align_corners (bool): align_corners argument of F.interpolate.
            Default: False

    Raises:
        AssertionError: If ``global_in_channels != higher_in_channels`` or
            ``global_out_channels != lower_in_channels``.
    """

    def __init__(self,
                 in_channels=3,
                 downsample_dw_channels=(32, 48),
                 global_in_channels=64,
                 global_block_channels=(64, 96, 128),
                 global_block_strides=(2, 2, 1),
                 global_out_channels=128,
                 higher_in_channels=64,
                 lower_in_channels=128,
                 fusion_out_channels=128,
                 out_indices=(0, 1, 2),
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 align_corners=False):

        super(FastSCNN, self).__init__()
        # Messages are built from adjacent literals; a backslash continuation
        # inside the literal would embed the next line's indentation.
        if global_in_channels != higher_in_channels:
            raise AssertionError(
                'Global Input Channels must be the same '
                'with Higher Input Channels!')
        if global_out_channels != lower_in_channels:
            raise AssertionError(
                'Global Output Channels must be the same '
                'with Lower Input Channels!')

        self.in_channels = in_channels
        self.downsample_dw_channels1 = downsample_dw_channels[0]
        self.downsample_dw_channels2 = downsample_dw_channels[1]
        self.global_in_channels = global_in_channels
        self.global_block_channels = global_block_channels
        self.global_block_strides = global_block_strides
        self.global_out_channels = global_out_channels
        self.higher_in_channels = higher_in_channels
        self.lower_in_channels = lower_in_channels
        self.fusion_out_channels = fusion_out_channels
        self.out_indices = out_indices
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.align_corners = align_corners
        self.learning_to_downsample = LearningToDownsample(
            in_channels,
            downsample_dw_channels,
            global_in_channels,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        self.global_feature_extractor = GlobalFeatureExtractor(
            global_in_channels,
            global_block_channels,
            global_out_channels,
            strides=self.global_block_strides,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            align_corners=self.align_corners)
        self.feature_fusion = FeatureFusionModule(
            higher_in_channels,
            lower_in_channels,
            fusion_out_channels,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            align_corners=self.align_corners)

    def init_weights(self, pretrained=None):
        """Initialize conv and norm layers.

        Args:
            pretrained: Accepted for interface compatibility but not used;
                no checkpoint is loaded here.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                kaiming_init(m)
            elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                constant_init(m, 1)

    def forward(self, x):
        """Compute the features selected by ``out_indices``.

        Args:
            x (Tensor): Input image batch.

        Returns:
            tuple[Tensor]: Entries of
                [higher_res_features, lower_res_features, fusion_output]
                picked, in order, by ``self.out_indices``.
        """
        higher_res_features = self.learning_to_downsample(x)
        lower_res_features = self.global_feature_extractor(higher_res_features)
        fusion_output = self.feature_fusion(higher_res_features,
                                            lower_res_features)

        outs = [higher_res_features, lower_res_features, fusion_output]
        outs = [outs[i] for i in self.out_indices]
        return tuple(outs)
+ """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True)): + super(HRModule, self).__init__() + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + """Check branches configuration.""" + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_BLOCKS(' \ + f'{len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_CHANNELS(' \ + f'{len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_INCHANNELS(' \ + f'{len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Build one branch.""" + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + 
norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + """Build multiple branch.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Build fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + # we set align_corners=False for HRNet + Upsample( + scale_factor=2**(j - i), + mode='bilinear', + align_corners=False))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + 
fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + elif j > i: + y = y + resize( + self.fuse_layers[i][j](x[j]), + size=x[i].shape[2:], + mode='bilinear', + align_corners=False) + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(nn.Module): + """HRNet backbone. + + High-Resolution Representations for Labeling Pixels and Regions + arXiv: https://arxiv.org/abs/1904.04514 + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Normally 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. 
    def __init__(self,
                 extra,
                 in_channels=3,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 norm_eval=False,
                 with_cp=False,
                 zero_init_residual=False):
        # Builds the stem (two stride-2 3x3 convs), stage 1 (a plain layer of
        # blocks) and stages 2-4 (transition layer + stack of HRModules).
        # `extra` must provide the keys 'stage1'..'stage4'; each stage dict is
        # read for 'num_channels' and 'block' here (stage 1 also for
        # 'num_blocks') and is passed whole to `_make_stage` for stages 2-4.
        super(HRNet, self).__init__()
        self.extra = extra
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.norm_eval = norm_eval
        self.with_cp = with_cp
        self.zero_init_residual = zero_init_residual

        # stem net
        # The norm layers are created first but registered via add_module
        # right after their conv, under the names returned by
        # build_norm_layer (exposed through the norm1/norm2 properties).
        self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2)

        self.conv1 = build_conv_layer(
            self.conv_cfg,
            in_channels,
            64,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)

        self.add_module(self.norm1_name, norm1)
        self.conv2 = build_conv_layer(
            self.conv_cfg,
            64,
            64,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)

        self.add_module(self.norm2_name, norm2)
        self.relu = nn.ReLU(inplace=True)

        # stage 1
        # Only the first entry of 'num_channels' / 'num_blocks' is used here.
        self.stage1_cfg = self.extra['stage1']
        num_channels = self.stage1_cfg['num_channels'][0]
        block_type = self.stage1_cfg['block']
        num_blocks = self.stage1_cfg['num_blocks'][0]

        block = self.blocks_dict[block_type]
        # Output width of stage 1 after the block's channel expansion.
        stage1_out_channels = num_channels * block.expansion
        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)

        # stage 2
        self.stage2_cfg = self.extra['stage2']
        num_channels = self.stage2_cfg['num_channels']
        block_type = self.stage2_cfg['block']

        block = self.blocks_dict[block_type]
        # Per-branch widths scaled by the block expansion; these are the
        # channel counts the transition layer has to produce.
        num_channels = [channel * block.expansion for channel in num_channels]
        self.transition1 = self._make_transition_layer([stage1_out_channels],
                                                       num_channels)
        # _make_stage also returns the channel list, which feeds the next
        # transition layer as the "previous stage" widths.
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        # stage 3
        self.stage3_cfg = self.extra['stage3']
        num_channels = self.stage3_cfg['num_channels']
        block_type = self.stage3_cfg['block']

        block = self.blocks_dict[block_type]
        num_channels = [channel * block.expansion for channel in num_channels]
        self.transition2 = self._make_transition_layer(pre_stage_channels,
                                                       num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        # stage 4
        self.stage4_cfg = self.extra['stage4']
        num_channels = self.stage4_cfg['num_channels']
        block_type = self.stage4_cfg['block']

        block = self.blocks_dict[block_type]
        num_channels = [channel * block.expansion for channel in num_channels]
        self.transition3 = self._make_transition_layer(pre_stage_channels,
                                                       num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels)
layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + """Make each layer.""" + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, 
in_channels, multiscale_output=True): + """Make each stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*hr_modules), in_channels + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if 
self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(HRNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/lavis/common/annotator/uniformer/mmseg/models/backbones/mobilenet_v2.py b/lavis/common/annotator/uniformer/mmseg/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ab6b3791692a0d1b5da3601875711710b7bd01ba --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/backbones/mobilenet_v2.py @@ -0,0 +1,180 @@ +import logging + +import torch.nn as nn +from annotator.uniformer.mmcv.cnn import ConvModule, constant_init, kaiming_init +from annotator.uniformer.mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from ..utils import InvertedResidual, make_divisible + + +@BACKBONES.register_module() +class MobileNetV2(nn.Module): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + strides (Sequence[int], optional): Strides of the first block of each + layer. If not specified, default config in ``arch_setting`` will + be used. + dilations (Sequence[int]): Dilation of each layer. + out_indices (None or Sequence[int]): Output from which stages. + Default: (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). 
+ Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + # Parameters to build layers. 3 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks. + arch_settings = [[1, 16, 1], [6, 24, 2], [6, 32, 3], [6, 64, 4], + [6, 96, 3], [6, 160, 3], [6, 320, 1]] + + def __init__(self, + widen_factor=1., + strides=(1, 2, 2, 2, 1, 2, 1), + dilations=(1, 1, 1, 1, 1, 1, 1), + out_indices=(1, 2, 4, 6), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False): + super(MobileNetV2, self).__init__() + self.widen_factor = widen_factor + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == len(self.arch_settings) + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 7): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 7): + raise ValueError('frozen_stages must be in range(-1, 7). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks = layer_cfg + stride = self.strides[i] + dilation = self.dilations[i] + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + def make_layer(self, out_channels, num_blocks, stride, dilation, + expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): Number of blocks. + stride (int): Stride of the first block. + dilation (int): Dilation of the first block. + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. 
+ """ + layers = [] + for i in range(num_blocks): + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride if i == 0 else 1, + expand_ratio=expand_ratio, + dilation=dilation if i == 0 else 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(MobileNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/lavis/common/annotator/uniformer/mmseg/models/backbones/mobilenet_v3.py b/lavis/common/annotator/uniformer/mmseg/models/backbones/mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..16817400b4102899794fe64c9644713a4e54e2f9 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/backbones/mobilenet_v3.py @@ -0,0 +1,255 @@ +import logging + +import annotator.uniformer.mmcv as mmcv 
+import torch.nn as nn +from annotator.uniformer.mmcv.cnn import ConvModule, constant_init, kaiming_init +from annotator.uniformer.mmcv.cnn.bricks import Conv2dAdaptivePadding +from annotator.uniformer.mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from ..utils import InvertedResidualV3 as InvertedResidual + + +@BACKBONES.register_module() +class MobileNetV3(nn.Module): + """MobileNetV3 backbone. + + This backbone is the improved implementation of `Searching for MobileNetV3 + `_. + + Args: + arch (str): Architecture of mobilnetv3, from {'small', 'large'}. + Default: 'small'. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + out_indices (tuple[int]): Output from which layer. + Default: (0, 1, 12). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. 
+ """ + # Parameters to build each block: + # [kernel size, mid channels, out channels, with_se, act type, stride] + arch_settings = { + 'small': [[3, 16, 16, True, 'ReLU', 2], # block0 layer1 os=4 + [3, 72, 24, False, 'ReLU', 2], # block1 layer2 os=8 + [3, 88, 24, False, 'ReLU', 1], + [5, 96, 40, True, 'HSwish', 2], # block2 layer4 os=16 + [5, 240, 40, True, 'HSwish', 1], + [5, 240, 40, True, 'HSwish', 1], + [5, 120, 48, True, 'HSwish', 1], # block3 layer7 os=16 + [5, 144, 48, True, 'HSwish', 1], + [5, 288, 96, True, 'HSwish', 2], # block4 layer9 os=32 + [5, 576, 96, True, 'HSwish', 1], + [5, 576, 96, True, 'HSwish', 1]], + 'large': [[3, 16, 16, False, 'ReLU', 1], # block0 layer1 os=2 + [3, 64, 24, False, 'ReLU', 2], # block1 layer2 os=4 + [3, 72, 24, False, 'ReLU', 1], + [5, 72, 40, True, 'ReLU', 2], # block2 layer4 os=8 + [5, 120, 40, True, 'ReLU', 1], + [5, 120, 40, True, 'ReLU', 1], + [3, 240, 80, False, 'HSwish', 2], # block3 layer7 os=16 + [3, 200, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 480, 112, True, 'HSwish', 1], # block4 layer11 os=16 + [3, 672, 112, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 2], # block5 layer13 os=32 + [5, 960, 160, True, 'HSwish', 1], + [5, 960, 160, True, 'HSwish', 1]] + } # yapf: disable + + def __init__(self, + arch='small', + conv_cfg=None, + norm_cfg=dict(type='BN'), + out_indices=(0, 1, 12), + frozen_stages=-1, + reduction_factor=1, + norm_eval=False, + with_cp=False): + super(MobileNetV3, self).__init__() + assert arch in self.arch_settings + assert isinstance(reduction_factor, int) and reduction_factor > 0 + assert mmcv.is_tuple_of(out_indices, int) + for index in out_indices: + if index not in range(0, len(self.arch_settings[arch]) + 2): + raise ValueError( + 'the item in out_indices must in ' + f'range(0, {len(self.arch_settings[arch])+2}). 
' + f'But received {index}') + + if frozen_stages not in range(-1, len(self.arch_settings[arch]) + 2): + raise ValueError('frozen_stages must be in range(-1, ' + f'{len(self.arch_settings[arch])+2}). ' + f'But received {frozen_stages}') + self.arch = arch + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.reduction_factor = reduction_factor + self.norm_eval = norm_eval + self.with_cp = with_cp + self.layers = self._make_layer() + + def _make_layer(self): + layers = [] + + # build the first layer (layer0) + in_channels = 16 + layer = ConvModule( + in_channels=3, + out_channels=in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=self.norm_cfg, + act_cfg=dict(type='HSwish')) + self.add_module('layer0', layer) + layers.append('layer0') + + layer_setting = self.arch_settings[self.arch] + for i, params in enumerate(layer_setting): + (kernel_size, mid_channels, out_channels, with_se, act, + stride) = params + + if self.arch == 'large' and i >= 12 or self.arch == 'small' and \ + i >= 8: + mid_channels = mid_channels // self.reduction_factor + out_channels = out_channels // self.reduction_factor + + if with_se: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), + dict(type='HSigmoid', bias=3.0, divisor=6.0))) + else: + se_cfg = None + + layer = InvertedResidual( + in_channels=in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + with_expand_conv=(in_channels != mid_channels), + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=act), + with_cp=self.with_cp) + in_channels = out_channels + layer_name = 'layer{}'.format(i + 1) + self.add_module(layer_name, layer) + layers.append(layer_name) + + # build the last layer + # block5 layer12 os=32 for small model + # block6 layer16 os=32 for large model + layer = 
ConvModule( + in_channels=in_channels, + out_channels=576 if self.arch == 'small' else 960, + kernel_size=1, + stride=1, + dilation=4, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='HSwish')) + layer_name = 'layer{}'.format(len(layer_setting) + 1) + self.add_module(layer_name, layer) + layers.append(layer_name) + + # next, convert backbone MobileNetV3 to a semantic segmentation version + if self.arch == 'small': + self.layer4.depthwise_conv.conv.stride = (1, 1) + self.layer9.depthwise_conv.conv.stride = (1, 1) + for i in range(4, len(layers)): + layer = getattr(self, layers[i]) + if isinstance(layer, InvertedResidual): + modified_module = layer.depthwise_conv.conv + else: + modified_module = layer.conv + + if i < 9: + modified_module.dilation = (2, 2) + pad = 2 + else: + modified_module.dilation = (4, 4) + pad = 4 + + if not isinstance(modified_module, Conv2dAdaptivePadding): + # Adjust padding + pad *= (modified_module.kernel_size[0] - 1) // 2 + modified_module.padding = (pad, pad) + else: + self.layer7.depthwise_conv.conv.stride = (1, 1) + self.layer13.depthwise_conv.conv.stride = (1, 1) + for i in range(7, len(layers)): + layer = getattr(self, layers[i]) + if isinstance(layer, InvertedResidual): + modified_module = layer.depthwise_conv.conv + else: + modified_module = layer.conv + + if i < 13: + modified_module.dilation = (2, 2) + pad = 2 + else: + modified_module.dilation = (4, 4) + pad = 4 + + if not isinstance(modified_module, Conv2dAdaptivePadding): + # Adjust padding + pad *= (modified_module.kernel_size[0] - 1) // 2 + modified_module.padding = (pad, pad) + + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + 
class RSoftmax(nn.Module):
    """Radix Softmax module in ``SplitAttentionConv2d``.

    For ``radix > 1`` the input is viewed as
    ``(batch, groups, radix, -1)``, softmax-normalized across the radix
    axis and flattened to ``(batch, -1)``. Otherwise an element-wise
    sigmoid is applied and the input shape is kept.

    Args:
        radix (int): Radix of input.
        groups (int): Groups of input.
    """

    def __init__(self, radix, groups):
        super().__init__()
        self.groups = groups
        self.radix = radix

    def forward(self, x):
        batch_size = x.size(0)
        if not self.radix > 1:
            return torch.sigmoid(x)

        grouped = x.view(batch_size, self.groups, self.radix, -1)
        # Move radix to dim 1 so the softmax competes across the splits.
        radix_first = grouped.transpose(1, 2)
        weights = F.softmax(radix_first, dim=1)
        return weights.reshape(batch_size, -1)
+ out_channels (int): Same as nn.Conv2d. + kernel_size (int | tuple[int]): Same as nn.Conv2d. + stride (int | tuple[int]): Same as nn.Conv2d. + padding (int | tuple[int]): Same as nn.Conv2d. + dilation (int | tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels. Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + dcn (dict): Config dict for DCN. Default: None. + """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None): + super(SplitAttentionConv2d, self).__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.with_dcn = dcn is not None + self.dcn = dcn + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_dcn and not fallback_on_stride: + assert conv_cfg is None, 'conv_cfg must be None for DCN' + conv_cfg = dcn + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = 
RSoftmax(radix, groups) + + @property + def norm0(self): + """nn.Module: the normalization layer named "norm0" """ + return getattr(self, self.norm0_name) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + batch = x.size(0) + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + inplane (int): Input planes of this block. + planes (int): Middle planes of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Key word arguments for base class. 
    def __init__(self,
                 inplanes,
                 planes,
                 groups=1,
                 base_width=4,
                 base_channels=64,
                 radix=2,
                 reduction_factor=4,
                 avg_down_stride=True,
                 **kwargs):
        """Bottleneck block for ResNeSt.

        Calls the base-class constructor first, then replaces ``conv1``,
        ``conv2`` and ``conv3`` (and registers fresh ``norm1`` / ``norm3``
        layers) using the ResNeSt width; ``conv2`` becomes a
        ``SplitAttentionConv2d``.
        """
        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)

        # Width of the middle (conv2) stage: unchanged for a single group,
        # otherwise scaled by base_width / base_channels per group.
        if groups == 1:
            width = self.planes
        else:
            width = math.floor(self.planes *
                               (base_width / base_channels)) * groups

        # Average-pool downsampling is only used when conv2 would stride.
        self.avg_down_stride = avg_down_stride and self.conv2_stride > 1

        self.norm1_name, norm1 = build_norm_layer(
            self.norm_cfg, width, postfix=1)
        self.norm3_name, norm3 = build_norm_layer(
            self.norm_cfg, self.planes * self.expansion, postfix=3)

        self.conv1 = build_conv_layer(
            self.conv_cfg,
            self.inplanes,
            width,
            kernel_size=1,
            stride=self.conv1_stride,
            bias=False)
        self.add_module(self.norm1_name, norm1)
        self.with_modulated_dcn = False
        # When avg_down_stride is active the conv runs at stride 1 and the
        # striding is done by avd_layer below.
        self.conv2 = SplitAttentionConv2d(
            width,
            width,
            kernel_size=3,
            stride=1 if self.avg_down_stride else self.conv2_stride,
            padding=self.dilation,
            dilation=self.dilation,
            groups=groups,
            radix=radix,
            reduction_factor=reduction_factor,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            dcn=self.dcn)
        # norm2 is removed from this block (presumably it was registered by
        # the base-class constructor -- confirm against resnet.Bottleneck);
        # forward() here never calls it.
        delattr(self, self.norm2_name)

        if self.avg_down_stride:
            self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1)

        self.conv3 = build_conv_layer(
            self.conv_cfg,
            width,
            self.planes * self.expansion,
            kernel_size=1,
            bias=False)
        self.add_module(self.norm3_name, norm3)
@BACKBONES.register_module()
class ResNeSt(ResNetV1d):
    """ResNeSt backbone.

    Args:
        groups (int): Number of groups of Bottleneck. Default: 1
        base_width (int): Base width of Bottleneck. Default: 4
        radix (int): Radix of SplitAttentionConv2d. Default: 2
        reduction_factor (int): Reduction factor of inter_channels in
            SplitAttentionConv2d. Default: 4.
        avg_down_stride (bool): Whether to use average pool for stride in
            Bottleneck. Default: True.
        kwargs (dict): Keyword arguments for ResNet.
    """

    # depth -> (block class, number of blocks in each stage)
    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3)),
        200: (Bottleneck, (3, 24, 36, 3))
    }

    def __init__(self,
                 groups=1,
                 base_width=4,
                 radix=2,
                 reduction_factor=4,
                 avg_down_stride=True,
                 **kwargs):
        # These are stored before the base-class constructor runs;
        # make_res_layer reads them, presumably while the base class is
        # building its stages -- confirm against ResNetV1d.
        self.groups = groups
        self.base_width = base_width
        self.radix = radix
        self.reduction_factor = reduction_factor
        self.avg_down_stride = avg_down_stride
        super(ResNeSt, self).__init__(**kwargs)

    def make_res_layer(self, **kwargs):
        """Pack all blocks in a stage into a ``ResLayer``."""
        resnest_options = dict(
            groups=self.groups,
            base_width=self.base_width,
            base_channels=self.base_channels,
            radix=self.radix,
            reduction_factor=self.reduction_factor,
            avg_down_stride=self.avg_down_stride)
        return ResLayer(**resnest_options, **kwargs)
class BasicBlock(nn.Module):
    """Basic residual block for ResNet: two 3x3 convolutions plus a shortcut.

    Args:
        inplanes (int): Input channels of the first convolution.
        planes (int): Output channels of both convolutions.
        stride (int): Stride of the first convolution. Default: 1.
        dilation (int): Dilation (and padding) of the first convolution.
            Default: 1.
        downsample (nn.Module | None): Optional module applied to the block
            input to produce the shortcut branch. Default: None.
        style (str): Accepted for interface parity with ``Bottleneck``; it is
            not used by this block.
        with_cp (bool): Run the residual branch under
            ``torch.utils.checkpoint`` when the input requires grad.
            Default: False.
        conv_cfg (dict | None): Config passed to ``build_conv_layer``.
        norm_cfg (dict): Config passed to ``build_norm_layer``.
        dcn: Must be None (not implemented for this block).
        plugins: Must be None (not implemented for this block).
    """

    expansion = 1

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 with_cp=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 dcn=None,
                 plugins=None):
        super(BasicBlock, self).__init__()
        assert dcn is None, 'Not implemented yet.'
        assert plugins is None, 'Not implemented yet.'

        # Norm layers are built first; only their names are kept as plain
        # attributes, the modules themselves are registered below.
        self.norm1_name, first_norm = build_norm_layer(
            norm_cfg, planes, postfix=1)
        self.norm2_name, second_norm = build_norm_layer(
            norm_cfg, planes, postfix=2)

        # Registration order: conv1, norm1, conv2, norm2.
        self.conv1 = build_conv_layer(
            conv_cfg,
            inplanes,
            planes,
            3,
            stride=stride,
            padding=dilation,
            dilation=dilation,
            bias=False)
        self.add_module(self.norm1_name, first_norm)
        # The second convolution is given neither stride nor dilation.
        self.conv2 = build_conv_layer(
            conv_cfg, planes, planes, 3, padding=1, bias=False)
        self.add_module(self.norm2_name, second_norm)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.with_cp = with_cp

    @property
    def norm1(self):
        """nn.Module: normalization layer after the first convolution layer"""
        return getattr(self, self.norm1_name)

    @property
    def norm2(self):
        """nn.Module: normalization layer after the second convolution layer"""
        return getattr(self, self.norm2_name)

    def forward(self, x):
        """Forward function."""

        def _residual_sum(inp):
            # Main branch: conv -> norm -> relu -> conv -> norm.
            feat = self.relu(self.norm1(self.conv1(inp)))
            feat = self.norm2(self.conv2(feat))

            # Shortcut branch: identity unless a downsample module was given.
            shortcut = inp
            if self.downsample is not None:
                shortcut = self.downsample(inp)

            feat += shortcut
            return feat

        if self.with_cp and x.requires_grad:
            summed = cp.checkpoint(_residual_sum, x)
        else:
            summed = _residual_sum(x)

        return self.relu(summed)
build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
def forward_plugin(self, x, plugin_names):
    """Apply the named plugin modules to ``x`` one after another.

    Args:
        x: Input passed to the first plugin.
        plugin_names (list[str]): Attribute names of the plugin modules on
            ``self``, in application order.

    Returns:
        The output of the last plugin, or ``x`` unchanged when
        ``plugin_names`` is empty.
    """
    out = x
    for name in plugin_names:
        # Chain on ``out`` rather than ``x``: feeding ``x`` to every plugin
        # would discard the result of all plugins except the last one.
        out = getattr(self, name)(out)
    return out
Default: 3. + stem_channels (int): Number of stem channels. Default: 64. + base_channels (int): Number of base channels of res layer. Default: 64. + num_stages (int): Resnet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + + - position (str, required): Position inside block to insert plugin, + options: 'after_conv1', 'after_conv2', 'after_conv3'. + + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages' + multi_grid (Sequence[int]|None): Multi grid dilation rates of last + stage. Default: None + contract_dilation (bool): Whether contract first dilation of each layer + Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity.
+ + Example: + >>> from annotator.uniformer.mmseg.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=64, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + multi_grid=None, + contract_dilation=False, + with_cp=False, + zero_init_residual=True): + super(ResNet, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.multi_grid = multi_grid + self.contract_dilation = contract_dilation + 
def make_stage_plugins(self, plugins, stage_idx):
    """Select the plugin configs that apply to stage ``stage_idx``.

    Each entry of ``plugins`` is shallow-copied and its optional ``'stages'``
    key is removed from the copy; the caller's dicts are left untouched.
    An entry is kept when it has no ``'stages'`` key (or the value is None),
    or when ``stages[stage_idx]`` is truthy.

    An example of plugins format could be :
    >>> plugins=[
    ...     dict(cfg=dict(type='xxx', arg1='xxx'),
    ...          stages=(False, True, True, True),
    ...          position='after_conv2'),
    ...     dict(cfg=dict(type='yyy'),
    ...          stages=(True, True, True, True),
    ...          position='after_conv3'),
    ... ]

    With these two entries, stage 0 receives only the 'yyy' entry, while
    stage 1 receives both.

    Args:
        plugins (list[dict]): List of plugins cfg to build.
        stage_idx (int): Index of stage to build

    Returns:
        list[dict]: Plugins for current stage, without the 'stages' key.

    Raises:
        AssertionError: If a ``'stages'`` value is given whose length differs
            from ``self.num_stages``.
    """
    selected = []
    for spec in plugins:
        stage_cfg = spec.copy()
        stage_mask = stage_cfg.pop('stages', None)
        if stage_mask is not None:
            assert len(stage_mask) == self.num_stages
            # The mask says whether this plugin is enabled for each stage.
            if not stage_mask[stage_idx]:
                continue
        selected.append(stage_cfg)

    return selected
build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze stages param and norm stats.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
def forward(self, x):
    """Forward function.

    Runs the stem (``self.stem`` for a deep stem, otherwise
    ``conv1 -> norm1 -> relu``), then ``self.maxpool``, then every residual
    stage named in ``self.res_layers`` in order.

    Returns:
        tuple: Outputs of the stages whose index is in ``self.out_indices``,
        in stage order.
    """
    if self.deep_stem:
        feat = self.stem(x)
    else:
        feat = self.conv1(x)
        feat = self.norm1(feat)
        feat = self.relu(feat)
    feat = self.maxpool(feat)

    collected = []
    for stage_idx, stage_name in enumerate(self.res_layers):
        feat = getattr(self, stage_name)(feat)
        if stage_idx in self.out_indices:
            collected.append(feat)
    return tuple(collected)
class Bottleneck(_Bottleneck):
    """Bottleneck block for ResNeXt.

    If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is
    "caffe", the stride-two layer is the first 1x1 conv layer.

    The parent constructor runs first; afterwards the three norm layers and
    three convolutions are rebuilt with the ResNeXt width, and the 3x3
    convolution becomes a grouped convolution.

    Args:
        inplanes (int): Input channels of the block.
        planes (int): Base channels of the block.
        groups (int): Number of groups of the 3x3 convolution. Default: 1.
        base_width (int): Per-group base width used to derive the inner
            width. Default: 4.
        base_channels (int): Reference channel count used to derive the
            inner width. Default: 64.
        kwargs (dict): Remaining keyword arguments for the parent block.
    """

    def __init__(self,
                 inplanes,
                 planes,
                 groups=1,
                 base_width=4,
                 base_channels=64,
                 **kwargs):
        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)

        # Inner width: unchanged for a single group, otherwise scaled by
        # base_width / base_channels and multiplied by the group count.
        width = self.planes if groups == 1 else math.floor(
            self.planes * (base_width / base_channels)) * groups
        out_channels = self.planes * self.expansion

        self.norm1_name, norm1 = build_norm_layer(
            self.norm_cfg, width, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(
            self.norm_cfg, width, postfix=2)
        self.norm3_name, norm3 = build_norm_layer(
            self.norm_cfg, out_channels, postfix=3)

        self.conv1 = build_conv_layer(
            self.conv_cfg,
            self.inplanes,
            width,
            kernel_size=1,
            stride=self.conv1_stride,
            bias=False)
        self.add_module(self.norm1_name, norm1)

        fallback_on_stride = False
        self.with_modulated_dcn = False
        if self.with_dcn:
            # NOTE(review): the parent constructor already pops
            # 'fallback_on_stride' from this same dict, so this lookup
            # presumably always yields the default -- confirm.
            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)

        # Arguments shared by the plain and the DCN flavour of conv2.
        conv2_kwargs = dict(
            kernel_size=3,
            stride=self.conv2_stride,
            padding=self.dilation,
            dilation=self.dilation,
            groups=groups,
            bias=False)
        if fallback_on_stride or not self.with_dcn:
            self.conv2 = build_conv_layer(self.conv_cfg, width, width,
                                          **conv2_kwargs)
        else:
            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
            self.conv2 = build_conv_layer(self.dcn, width, width,
                                          **conv2_kwargs)

        self.add_module(self.norm2_name, norm2)
        self.conv3 = build_conv_layer(
            self.conv_cfg,
            width,
            out_channels,
            kernel_size=1,
            bias=False)
        self.add_module(self.norm3_name, norm3)
+ strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from annotator.uniformer.mmseg.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
class BasicConvBlock(nn.Module):
    """Basic convolutional block for UNet.

    A stack of ``num_convs`` 3x3 ``ConvModule`` layers held in one
    ``nn.Sequential``.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        num_convs (int): Number of convolutional layers. Default: 2.
        stride (int): Stride of the first convolutional layer; every later
            layer uses stride 1. Default: 1.
        dilation (int): Dilation rate (and padding) of every layer except
            the first, which always uses dilation 1 and padding 1.
            Default: 1.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        conv_cfg (dict | None): Config dict for convolution layer.
            Default: None.
        norm_cfg (dict | None): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict | None): Config dict for activation layer in ConvModule.
            Default: dict(type='ReLU').
        dcn: Must be None (not implemented). Default: None.
        plugins: Must be None (not implemented). Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_convs=2,
                 stride=1,
                 dilation=1,
                 with_cp=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 dcn=None,
                 plugins=None):
        super(BasicConvBlock, self).__init__()
        assert dcn is None, 'Not implemented yet.'
        assert plugins is None, 'Not implemented yet.'

        self.with_cp = with_cp
        layers = []
        for idx in range(num_convs):
            if idx == 0:
                # Only the first layer changes channels and may downsample.
                src_channels, conv_stride, rate = in_channels, stride, 1
            else:
                src_channels, conv_stride, rate = out_channels, 1, dilation
            layers.append(
                ConvModule(
                    in_channels=src_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    stride=conv_stride,
                    dilation=rate,
                    padding=rate,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))

        self.convs = nn.Sequential(*layers)

    def forward(self, x):
        """Forward function."""
        if self.with_cp and x.requires_grad:
            return cp.checkpoint(self.convs, x)
        return self.convs(x)
@UPSAMPLE_LAYERS.register_module()
class InterpConv(nn.Module):
    """Interpolation upsample module in decoder for UNet.

    Combines one ``nn.Upsample`` layer with one ``ConvModule``. By default
    the upsample runs first and the convolution second; with
    ``conv_first=True`` the order is reversed.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        norm_cfg (dict | None): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict | None): Config dict for activation layer in ConvModule.
            Default: dict(type='ReLU').
        conv_cfg (dict | None): Config dict for convolution layer.
            Default: None.
        conv_first (bool): Whether the convolutional layer comes before the
            upsample layer. Default: False.
        kernel_size (int): Kernel size of the convolutional layer. Default: 1.
        stride (int): Stride of the convolutional layer. Default: 1.
        padding (int): Padding of the convolutional layer. Default: 0.
        upsample_cfg (dict): Keyword arguments for ``nn.Upsample``.
            Default: dict(
                scale_factor=2, mode='bilinear', align_corners=False).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 with_cp=False,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 *,
                 conv_cfg=None,
                 conv_first=False,
                 kernel_size=1,
                 stride=1,
                 padding=0,
                 upsample_cfg=dict(
                     scale_factor=2, mode='bilinear', align_corners=False)):
        super(InterpConv, self).__init__()

        self.with_cp = with_cp
        conv = ConvModule(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        upsample = nn.Upsample(**upsample_cfg)

        # ``conv_first`` only decides the order of the two stages.
        stages = (conv, upsample) if conv_first else (upsample, conv)
        self.interp_upsample = nn.Sequential(*stages)

    def forward(self, x):
        """Forward function."""
        if self.with_cp and x.requires_grad:
            return cp.checkpoint(self.interp_upsample, x)
        return self.interp_upsample(x)
+ downsamples (Sequence[int]): Whether use MaxPool to downsample the + feature map after the first stage of encoder + (stages: [1, num_stages)). If the correspondence encoder stage use + stride convolution (strides[i]=2), it will never use MaxPool to + downsample, even downsamples[i-1]=True. + Default: (True, True, True, True). + enc_dilations (Sequence[int]): Dilation rate of each stage in encoder. + Default: (1, 1, 1, 1, 1). + dec_dilations (Sequence[int]): Dilation rate of each stage in decoder. + Default: (1, 1, 1, 1). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + upsample_cfg (dict): The upsample config of the upsample module in + decoder. Default: dict(type='InterpConv'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + dcn (bool): Use deformable convolution in convolutional layer or not. + Default: None. + plugins (dict): plugins for convolutional layers. Default: None. + + Notice: + The input image size should be divisible by the whole downsample rate + of the encoder. More detail of the whole downsample rate can be found + in UNet._check_input_divisible. 
+ + """ + + def __init__(self, + in_channels=3, + base_channels=64, + num_stages=5, + strides=(1, 1, 1, 1, 1), + enc_num_convs=(2, 2, 2, 2, 2), + dec_num_convs=(2, 2, 2, 2), + downsamples=(True, True, True, True), + enc_dilations=(1, 1, 1, 1, 1), + dec_dilations=(1, 1, 1, 1), + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + upsample_cfg=dict(type='InterpConv'), + norm_eval=False, + dcn=None, + plugins=None): + super(UNet, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + assert len(strides) == num_stages, \ + 'The length of strides should be equal to num_stages, '\ + f'while the strides is {strides}, the length of '\ + f'strides is {len(strides)}, and the num_stages is '\ + f'{num_stages}.' + assert len(enc_num_convs) == num_stages, \ + 'The length of enc_num_convs should be equal to num_stages, '\ + f'while the enc_num_convs is {enc_num_convs}, the length of '\ + f'enc_num_convs is {len(enc_num_convs)}, and the num_stages is '\ + f'{num_stages}.' + assert len(dec_num_convs) == (num_stages-1), \ + 'The length of dec_num_convs should be equal to (num_stages-1), '\ + f'while the dec_num_convs is {dec_num_convs}, the length of '\ + f'dec_num_convs is {len(dec_num_convs)}, and the num_stages is '\ + f'{num_stages}.' + assert len(downsamples) == (num_stages-1), \ + 'The length of downsamples should be equal to (num_stages-1), '\ + f'while the downsamples is {downsamples}, the length of '\ + f'downsamples is {len(downsamples)}, and the num_stages is '\ + f'{num_stages}.' + assert len(enc_dilations) == num_stages, \ + 'The length of enc_dilations should be equal to num_stages, '\ + f'while the enc_dilations is {enc_dilations}, the length of '\ + f'enc_dilations is {len(enc_dilations)}, and the num_stages is '\ + f'{num_stages}.' 
+ assert len(dec_dilations) == (num_stages-1), \ + 'The length of dec_dilations should be equal to (num_stages-1), '\ + f'while the dec_dilations is {dec_dilations}, the length of '\ + f'dec_dilations is {len(dec_dilations)}, and the num_stages is '\ + f'{num_stages}.' + self.num_stages = num_stages + self.strides = strides + self.downsamples = downsamples + self.norm_eval = norm_eval + self.base_channels = base_channels + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + for i in range(num_stages): + enc_conv_block = [] + if i != 0: + if strides[i] == 1 and downsamples[i - 1]: + enc_conv_block.append(nn.MaxPool2d(kernel_size=2)) + upsample = (strides[i] != 1 or downsamples[i - 1]) + self.decoder.append( + UpConvBlock( + conv_block=BasicConvBlock, + in_channels=base_channels * 2**i, + skip_channels=base_channels * 2**(i - 1), + out_channels=base_channels * 2**(i - 1), + num_convs=dec_num_convs[i - 1], + stride=1, + dilation=dec_dilations[i - 1], + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + upsample_cfg=upsample_cfg if upsample else None, + dcn=None, + plugins=None)) + + enc_conv_block.append( + BasicConvBlock( + in_channels=in_channels, + out_channels=base_channels * 2**i, + num_convs=enc_num_convs[i], + stride=strides[i], + dilation=enc_dilations[i], + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dcn=None, + plugins=None)) + self.encoder.append((nn.Sequential(*enc_conv_block))) + in_channels = base_channels * 2**i + + def forward(self, x): + self._check_input_divisible(x) + enc_outs = [] + for enc in self.encoder: + x = enc(x) + enc_outs.append(x) + dec_outs = [x] + for i in reversed(range(len(self.decoder))): + x = self.decoder[i](enc_outs[i], x) + dec_outs.append(x) + + return dec_outs + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(UNet, self).train(mode) + if mode and self.norm_eval: + for m 
in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + def _check_input_divisible(self, x): + h, w = x.shape[-2:] + whole_downsample_rate = 1 + for i in range(1, self.num_stages): + if self.strides[i] == 2 or self.downsamples[i - 1]: + whole_downsample_rate *= 2 + assert (h % whole_downsample_rate == 0) \ + and (w % whole_downsample_rate == 0),\ + f'The input image size {(h, w)} should be divisible by the whole '\ + f'downsample rate {whole_downsample_rate}, when num_stages is '\ + f'{self.num_stages}, strides is {self.strides}, and downsamples '\ + f'is {self.downsamples}.' + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') diff --git a/lavis/common/annotator/uniformer/mmseg/models/backbones/uniformer.py b/lavis/common/annotator/uniformer/mmseg/models/backbones/uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..0c4bb88e4c928540cca9ab609988b916520f5b7a --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/backbones/uniformer.py @@ -0,0 +1,422 @@ +# -------------------------------------------------------- +# UniFormer +# Copyright (c) 2022 SenseTime X-Lab +# Licensed under The MIT License [see LICENSE for details] +# Written by Kunchang Li +# -------------------------------------------------------- + +from collections import OrderedDict +import math + +from functools import partial +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint 
as checkpoint +import numpy as np +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from annotator.uniformer.mmcv_custom import load_checkpoint +from annotator.uniformer.mmseg.utils import get_root_logger +from ..builder import BACKBONES + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class CMlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class CBlock(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) + self.norm1 = nn.BatchNorm2d(dim) + self.conv1 = nn.Conv2d(dim, dim, 1) + self.conv2 = nn.Conv2d(dim, dim, 1) + self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = nn.BatchNorm2d(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SABlock(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + B, N, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.transpose(1, 2).reshape(B, N, H, W) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class SABlock_Windows(nn.Module): + def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.window_size=window_size + self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + x = x.permute(0, 2, 3, 1) + B, H, W, C = x.shape + shortcut = x + x = self.norm1(x) + + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.permute(0, 3, 1, 2).reshape(B, C, H, W) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, _, H, W = x.shape + x = self.proj(x) + B, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 
2).contiguous() + return x + + +@BACKBONES.register_module() +class UniFormer(nn.Module): + """ Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - + https://arxiv.org/abs/2010.11929 + """ + def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512], + head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6), + pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0], + windows=False, hybrid=False, window_size=14): + """ + Args: + layer (list): number of block in each layer + img_size (int, tuple): input image size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + head_dim (int): dimension of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer (nn.Module): normalization layer + pretrained_path (str): path of pretrained model + use_checkpoint (bool): whether use checkpoint + checkpoint_num (list): index for using checkpoint in every stage + windows (bool): whether use window MHRA + hybrid (bool): whether use hybrid MHRA + window_size (int): size of window (>14) + """ + super().__init__() + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.checkpoint_num = checkpoint_num + self.windows = windows + print(f'Use Checkpoint: {self.use_checkpoint}') + print(f'Checkpoint Number: {self.checkpoint_num}') + 
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + + self.patch_embed1 = PatchEmbed( + img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dim] + self.blocks1 = nn.ModuleList([ + CBlock( + dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(layers[0])]) + self.norm1=norm_layer(embed_dim[0]) + self.blocks2 = nn.ModuleList([ + CBlock( + dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer) + for i in range(layers[1])]) + self.norm2 = norm_layer(embed_dim[1]) + if self.windows: + print('Use local window for all blocks in stage3') + self.blocks3 = nn.ModuleList([ + SABlock_Windows( + dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer) + for i in range(layers[2])]) + elif hybrid: + print('Use hybrid window for blocks in stage3') + block3 = [] + for i in range(layers[2]): + if (i + 1) % 4 == 0: + block3.append(SABlock( + 
dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)) + else: + block3.append(SABlock_Windows( + dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)) + self.blocks3 = nn.ModuleList(block3) + else: + print('Use global window for all blocks in stage3') + self.blocks3 = nn.ModuleList([ + SABlock( + dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer) + for i in range(layers[2])]) + self.norm3 = norm_layer(embed_dim[2]) + self.blocks4 = nn.ModuleList([ + SABlock( + dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer) + for i in range(layers[3])]) + self.norm4 = norm_layer(embed_dim[3]) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + + self.apply(self._init_weights) + self.init_weights(pretrained=pretrained_path) + + def init_weights(self, pretrained): + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) + print(f'Load pretrained model from {pretrained}') + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + 
nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + out = [] + x = self.patch_embed1(x) + x = self.pos_drop(x) + for i, blk in enumerate(self.blocks1): + if self.use_checkpoint and i < self.checkpoint_num[0]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm1(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + x = self.patch_embed2(x) + for i, blk in enumerate(self.blocks2): + if self.use_checkpoint and i < self.checkpoint_num[1]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm2(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + x = self.patch_embed3(x) + for i, blk in enumerate(self.blocks3): + if self.use_checkpoint and i < self.checkpoint_num[2]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm3(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + x = self.patch_embed4(x) + for i, blk in enumerate(self.blocks4): + if self.use_checkpoint and i < self.checkpoint_num[3]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm4(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + return tuple(out) + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/lavis/common/annotator/uniformer/mmseg/models/backbones/vit.py b/lavis/common/annotator/uniformer/mmseg/models/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..59e4479650690e08cbc4cab9427aefda47c2116d --- /dev/null +++ 
b/lavis/common/annotator/uniformer/mmseg/models/backbones/vit.py @@ -0,0 +1,459 @@ +"""Modified from https://github.com/rwightman/pytorch-image- +models/blob/master/timm/models/vision_transformer.py.""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from annotator.uniformer.mmcv.cnn import (Conv2d, Linear, build_activation_layer, build_norm_layer, + constant_init, kaiming_init, normal_init) +from annotator.uniformer.mmcv.runner import _load_checkpoint +from annotator.uniformer.mmcv.utils.parrots_wrapper import _BatchNorm + +from annotator.uniformer.mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import DropPath, trunc_normal_ + + +class Mlp(nn.Module): + """MLP layer for Encoder block. + + Args: + in_features(int): Input dimension for the first fully + connected layer. + hidden_features(int): Output dimension for the first fully + connected layer. + out_features(int): Output dementsion for the second fully + connected layer. + act_cfg(dict): Config dict for activation layer. + Default: dict(type='GELU'). + drop(float): Drop rate for the dropout layer. Dropout rate has + to be between 0 and 1. Default: 0. + """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + drop=0.): + super(Mlp, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + """Attention layer for Encoder block. + + Args: + dim (int): Dimension for the input vector. + num_heads (int): Number of parallel attention heads. 
+ qkv_bias (bool): Enable bias for qkv if True. Default: False. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for output weights. Default: 0. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super(Attention, self).__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + b, n, c = x.shape + qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, + c // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(b, n, c) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + """Implements encoder block with residual connection. + + Args: + dim (int): The feature dimension. + num_heads (int): Number of parallel attention heads. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop (float): Drop rate for mlp output weights. Default: 0. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for attn layer output weights. + Default: 0. + drop_path (float): Drop rate for paths of model. + Default: 0. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', requires_grad=True). + with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + proj_drop=0., + drop_path=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False): + super(Block, self).__init__() + self.with_cp = with_cp + _, self.norm1 = build_norm_layer(norm_cfg, dim) + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, attn_drop, + proj_drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + _, self.norm2 = build_norm_layer(norm_cfg, dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_cfg=act_cfg, + drop=drop) + + def forward(self, x): + + def _inner_forward(x): + out = x + self.drop_path(self.attn(self.norm1(x))) + out = out + self.drop_path(self.mlp(self.norm2(out))) + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding. + + Args: + img_size (int | tuple): Input image size. + default: 224. + patch_size (int): Width and height for a patch. + default: 16. + in_channels (int): Input channels for images. Default: 3. + embed_dim (int): The embedding dimension. Default: 768. 
+ """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super(PatchEmbed, self).__init__() + if isinstance(img_size, int): + self.img_size = (img_size, img_size) + elif isinstance(img_size, tuple): + self.img_size = img_size + else: + raise TypeError('img_size must be type of int or tuple') + h, w = self.img_size + self.patch_size = (patch_size, patch_size) + self.num_patches = (h // patch_size) * (w // patch_size) + self.proj = Conv2d( + in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + return self.proj(x).flatten(2).transpose(1, 2) + + +@BACKBONES.register_module() +class VisionTransformer(nn.Module): + """Vision transformer backbone. + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for + Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 + + Args: + img_size (tuple): input image size. Default: (224, 224). + patch_size (int, tuple): patch size. Default: 16. + in_channels (int): number of input channels. Default: 3. + embed_dim (int): embedding dimension. Default: 768. + depth (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): enable bias for qkv if True. Default: True. + qk_scale (float): override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): dropout rate. Default: 0. + attn_drop_rate (float): attention dropout rate. Default: 0. + drop_path_rate (float): Rate of DropPath. Default: 0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', eps=1e-6, requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). 
Note: Effect on Batch Norm + and its variants only. Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + interpolate_mode (str): Select the interpolate mode for position + embeding vector resize. Default: bicubic. + with_cls_token (bool): If concatenating class token into image tokens + as transformer input. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + """ + + def __init__(self, + img_size=(224, 224), + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + out_indices=11, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN', eps=1e-6, requires_grad=True), + act_cfg=dict(type='GELU'), + norm_eval=False, + final_norm=False, + with_cls_token=True, + interpolate_mode='bicubic', + with_cp=False): + super(VisionTransformer, self).__init__() + self.img_size = img_size + self.patch_size = patch_size + self.features = self.embed_dim = embed_dim + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + + self.with_cls_token = with_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + if isinstance(out_indices, int): + self.out_indices = [out_indices] + elif isinstance(out_indices, list) or isinstance(out_indices, tuple): + self.out_indices = out_indices + else: + raise TypeError('out_indices must be type of int, list or tuple') + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + 
qk_scale=qk_scale, + drop=dpr[i], + attn_drop=attn_drop_rate, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp) for i in range(depth) + ]) + + self.interpolate_mode = interpolate_mode + self.final_norm = final_norm + if final_norm: + _, self.norm = build_norm_layer(norm_cfg, embed_dim) + + self.norm_eval = norm_eval + self.with_cp = with_cp + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + checkpoint = _load_checkpoint(pretrained, logger=logger) + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + if 'pos_embed' in state_dict.keys(): + if self.pos_embed.shape != state_dict['pos_embed'].shape: + logger.info(msg=f'Resize the pos_embed shape from \ +{state_dict["pos_embed"].shape} to {self.pos_embed.shape}') + h, w = self.img_size + pos_size = int( + math.sqrt(state_dict['pos_embed'].shape[1] - 1)) + state_dict['pos_embed'] = self.resize_pos_embed( + state_dict['pos_embed'], (h, w), (pos_size, pos_size), + self.patch_size, self.interpolate_mode) + + self.load_state_dict(state_dict, False) + + elif pretrained is None: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'mlp' in n: + normal_init(m.bias, std=1e-6) + else: + constant_init(m.bias, 0) + elif isinstance(m, Conv2d): + kaiming_init(m.weight, mode='fan_in') + if m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) + else: + raise TypeError('pretrained must be a str or None') + + def _pos_embeding(self, img, patched_img, pos_embed): + 
@staticmethod
def resize_pos_embed(pos_embed, input_shpae, pos_shape, patch_size, mode):
    """Resize position-embedding weights to a new input resolution.

    The first token is kept untouched; the trailing ``pos_h * pos_w``
    tokens are laid out as a 2-D grid and interpolated.

    Args:
        pos_embed (torch.Tensor): pos_embed weights of shape [B, L, C].
        input_shpae (tuple): Tuple for (input_h, input_w).
        pos_shape (tuple): Tuple for (pos_h, pos_w) of the current grid.
        patch_size (int): Patch size.
        mode (str): Interpolation mode forwarded to ``F.interpolate``.

    Return:
        torch.Tensor: The resized pos_embed of shape [B, L_new, C]
    """
    assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
    target_h, target_w = input_shpae
    grid_h, grid_w = pos_shape
    embed_dim = pos_embed.shape[2]

    # Leading token, kept as a [B, 1, C] slice for the final concat.
    cls_part = pos_embed[:, 0].unsqueeze(1)

    # Trailing grid tokens -> [1, C, grid_h, grid_w] for interpolation.
    grid = pos_embed[:, (-1 * grid_h * grid_w):]
    grid = grid.reshape(1, grid_h, grid_w, embed_dim).permute(0, 3, 1, 2)
    new_size = [target_h // patch_size, target_w // patch_size]
    grid = F.interpolate(
        grid, size=new_size, align_corners=False, mode=mode)

    # Back to token layout [1, new_h * new_w, C].
    grid_tokens = torch.flatten(grid, 2).transpose(1, 2)
    return torch.cat((cls_part, grid_tokens), dim=1)
def build_segmentor(cfg, train_cfg=None, test_cfg=None):
    """Build segmentor.

    ``train_cfg`` / ``test_cfg`` passed here are deprecated: a
    ``UserWarning`` is issued when either is given, and each may not be
    specified both here and inside ``cfg``.
    """
    outer_cfgs = (('train_cfg', train_cfg), ('test_cfg', test_cfg))
    if any(value is not None for _, value in outer_cfgs):
        warnings.warn(
            'train_cfg and test_cfg is deprecated, '
            'please specify them in model', UserWarning)
    for key, outer_value in outer_cfgs:
        assert cfg.get(key) is None or outer_value is None, \
            f'{key} specified in both outer field and model field '
    defaults = dict(train_cfg=train_cfg, test_cfg=test_cfg)
    return SEGMENTORS.build(cfg, default_args=defaults)
class PPMConcat(nn.ModuleList):
    """Pyramid pooling that only concatenates the pooled features.

    Holds one ``nn.AdaptiveAvgPool2d`` per entry of ``pool_scales``.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
    """

    def __init__(self, pool_scales=(1, 3, 6, 8)):
        pools = [nn.AdaptiveAvgPool2d(scale) for scale in pool_scales]
        super().__init__(pools)

    def forward(self, feats):
        """Pool ``feats`` at every scale and concatenate along dim 2.

        Each pooled map is flattened to ``[feats.shape[0],
        feats.shape[1], -1]`` before concatenation.
        """
        batch, channels = feats.shape[:2]
        flattened = [pool(feats).view(batch, channels, -1) for pool in self]
        return torch.cat(flattened, dim=2)
def __init__(self, low_in_channels, high_in_channels, channels,
             out_channels, share_key_query, query_scale, key_pool_scales,
             conv_cfg, norm_cfg, act_cfg):
    """Configure the parent self-attention block for ANN.

    Keys come from the lower-level feature (pyramid-pooled with
    ``PPMConcat``) and queries from the higher-level feature, which is
    max-pooled only when ``query_scale`` is greater than 1.
    """
    # No query downsampling at scale 1 (or below).
    query_pool = (
        nn.MaxPool2d(kernel_size=query_scale) if query_scale > 1 else None)
    key_pool = PPMConcat(key_pool_scales)
    super(SelfAttentionBlock, self).__init__(
        key_in_channels=low_in_channels,
        query_in_channels=high_in_channels,
        channels=channels,
        out_channels=out_channels,
        share_key_query=share_key_query,
        query_downsample=query_pool,
        key_downsample=key_pool,
        key_query_num_convs=1,
        key_query_norm=True,
        value_out_num_convs=1,
        value_out_norm=False,
        matmul_norm=True,
        with_out=True,
        conv_cfg=conv_cfg,
        norm_cfg=norm_cfg,
        act_cfg=act_cfg)
class APNB(nn.Module):
    """Asymmetric Pyramid Non-local Block (APNB)

    Args:
        in_channels (int): Input channels of key/query feature,
            which is the key feature for self-attention.
        channels (int): Output channels of key/query transform.
        out_channels (int): Output channels.
        query_scales (tuple[int]): The scales of query feature map.
        key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module of key feature.
        conv_cfg (dict|None): Config of conv layers.
        norm_cfg (dict|None): Config of norm layers.
        act_cfg (dict|None): Config of activation layers.
    """

    def __init__(self, in_channels, channels, out_channels, query_scales,
                 key_pool_scales, conv_cfg, norm_cfg, act_cfg):
        super(APNB, self).__init__()
        # One attention stage per query scale; key and query share weights.
        self.stages = nn.ModuleList()
        for query_scale in query_scales:
            self.stages.append(
                SelfAttentionBlock(
                    low_in_channels=in_channels,
                    high_in_channels=in_channels,
                    channels=channels,
                    out_channels=out_channels,
                    share_key_query=True,
                    query_scale=query_scale,
                    key_pool_scales=key_pool_scales,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
        # forward() concatenates the attention context with the input
        # features, so the bottleneck sees in_channels + out_channels
        # channels (same rule AFNB uses). This equals the former
        # `2 * in_channels` whenever out_channels == in_channels.
        # NOTE(review): assumes each stage emits `out_channels` channels.
        self.bottleneck = ConvModule(
            in_channels + out_channels,
            out_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

    def forward(self, feats):
        """Forward function.

        Sums the outputs of all stages (each attending ``feats`` to
        itself), concatenates the sum with ``feats`` on the channel
        dimension and applies the bottleneck conv.
        """
        priors = [stage(feats, feats) for stage in self.stages]
        context = torch.stack(priors, dim=0).sum(dim=0)
        output = self.bottleneck(torch.cat([context, feats], 1))
        return output
def forward(self, inputs):
    """Forward function.

    Fuses the low/high level features, then applies dropout (when
    enabled), the bottleneck conv, the context block and ``cls_seg``.

    Args:
        inputs: Multi-level features; ``_transform_inputs`` must yield a
            ``(low_feats, high_feats)`` pair.

    Returns:
        The result of ``self.cls_seg`` on the processed features.
    """
    low_feats, high_feats = self._transform_inputs(inputs)
    output = self.fusion(low_feats, high_feats)
    # The base head stores `dropout = None` when dropout_ratio <= 0, so
    # calling it unconditionally would raise a TypeError.
    if self.dropout is not None:
        output = self.dropout(output)
    output = self.bottleneck(output)
    output = self.context(output)
    output = self.cls_seg(output)

    return output
def forward(self, x):
    """Forward function."""
    scale = self.pool_scale
    region_feats = F.adaptive_avg_pool2d(x, scale)
    # [batch_size, channels, h, w]
    x = self.input_redu_conv(x)
    # [batch_size, channels, pool_scale, pool_scale]
    region_feats = self.pooled_redu_conv(region_feats)
    batch_size = x.size(0)
    height, width = x.size(2), x.size(3)
    # [batch_size, pool_scale * pool_scale, channels]
    region_feats = region_feats.view(
        batch_size, self.channels, -1).permute(0, 2, 1).contiguous()

    # Global descriptor broadcast back to the spatial size of x.
    global_ctx = self.global_info(F.adaptive_avg_pool2d(x, 1))
    affinity_logits = self.gla(x + resize(global_ctx, size=x.shape[2:]))
    # [batch_size, h * w, pool_scale * pool_scale]
    affinity_matrix = F.sigmoid(
        affinity_logits.permute(0, 2, 3, 1).reshape(
            batch_size, -1, scale**2))

    # [batch_size, h * w, channels]
    z_out = torch.matmul(affinity_matrix, region_feats)
    # [batch_size, channels, h * w] -> [batch_size, channels, h, w]
    z_out = z_out.permute(0, 2, 1).contiguous().view(
        batch_size, self.channels, height, width)
    # Residual connection with the reduced input.
    z_out = F.relu(self.residual_conv(z_out) + x)
    if self.fusion:
        z_out = self.fusion_conv(z_out)

    return z_out
class ASPPModule(nn.ModuleList):
    """Atrous Spatial Pyramid Pooling (ASPP) Module.

    Holds one ``ConvModule`` per dilation rate: a 1x1 conv for rate 1,
    otherwise a 3x3 conv whose padding equals its dilation.

    Args:
        dilations (tuple[int]): Dilation rate of each layer.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        conv_cfg (dict|None): Config of conv layers.
        norm_cfg (dict|None): Config of norm layers.
        act_cfg (dict): Config of activation layers.
    """

    def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg,
                 act_cfg):
        super().__init__()
        self.dilations = dilations
        self.in_channels = in_channels
        self.channels = channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        for rate in dilations:
            pointwise = rate == 1
            branch = ConvModule(
                self.in_channels,
                self.channels,
                1 if pointwise else 3,
                dilation=rate,
                padding=0 if pointwise else rate,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
            self.append(branch)

    def forward(self, x):
        """Apply every branch to ``x`` and return the outputs as a list."""
        return [branch(x) for branch in self]
class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta):
    """Base class for cascade decode head used in
    :class:`CascadeEncoderDecoder`."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @abstractmethod
    def forward(self, inputs, prev_output):
        """Placeholder of forward function."""
        pass

    def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg,
                      train_cfg):
        """Forward function for training.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
            prev_output (Tensor): The output of previous decode head.
            img_metas (list[dict]): List of image info dicts. Not used by
                this base implementation.
            gt_semantic_seg (Tensor): Semantic segmentation masks
                used if the architecture supports semantic segmentation task.
            train_cfg (dict): The training config. Not used by this base
                implementation.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        return self.losses(self.forward(inputs, prev_output), gt_semantic_seg)

    def forward_test(self, inputs, prev_output, img_metas, test_cfg):
        """Forward function for testing.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
            prev_output (Tensor): The output of previous decode head.
            img_metas (list[dict]): List of image info dicts. Not used by
                this base implementation.
            test_cfg (dict): The testing config. Not used by this base
                implementation.

        Returns:
            Tensor: Output segmentation map.
        """
        return self.forward(inputs, prev_output)
def forward(self, inputs):
    """Forward function.

    Runs the first conv, ``self.recurrence`` rounds of criss-cross
    attention and the second conv; optionally fuses the result with the
    head input before classification.
    """
    feats = self._transform_inputs(inputs)
    attended = self.convs[0](feats)
    for _round in range(self.recurrence):
        attended = self.cca(attended)
    refined = self.convs[1](attended)
    if not self.concat_input:
        return self.cls_seg(refined)
    # Concatenate the head input with the refined features on dim 1.
    fused = self.conv_cat(torch.cat([feats, refined], dim=1))
    return self.cls_seg(fused)
class CAM(nn.Module):
    """Channel Attention Module (CAM)"""

    def __init__(self):
        super().__init__()
        self.gamma = Scale(0)

    def forward(self, x):
        """Forward function.

        Builds a channel-by-channel attention map from ``x`` and returns
        ``gamma(attended) + x`` with the same shape as ``x``.
        """
        batch_size, channels, height, width = x.size()
        # [batch_size, channels, height * width]
        flat = x.view(batch_size, channels, -1)
        # [batch_size, channels, channels]
        energy = torch.bmm(flat, flat.permute(0, 2, 1))
        # Softmax is taken over (row max - energy), not the raw energy.
        row_max = torch.max(energy, -1, keepdim=True)[0]
        attention = F.softmax(row_max.expand_as(energy) - energy, dim=-1)

        attended = torch.bmm(attention, flat)
        attended = attended.view(batch_size, channels, height, width)
        return self.gamma(attended) + x
def forward(self, inputs):
    """Forward function.

    Runs the position-attention (PAM) branch, then the channel-attention
    (CAM) branch, and finally classifies the sum of both branch features.

    Returns:
        tuple: ``(pam_cam_out, pam_out, cam_out)``.
    """
    x = self._transform_inputs(inputs)

    # PAM branch: in-conv -> attention -> out-conv -> classifier.
    pam_feat = self.pam_out_conv(self.pam(self.pam_in_conv(x)))
    pam_out = self.pam_cls_seg(pam_feat)

    # CAM branch, same layout; kept after the PAM branch.
    cam_feat = self.cam_out_conv(self.cam(self.cam_in_conv(x)))
    cam_out = self.cam_cls_seg(cam_feat)

    pam_cam_out = self.cls_seg(pam_feat + cam_feat)
    return pam_cam_out, pam_out, cam_out
def losses(self, seg_logit, seg_label):
    """Compute ``pam_cam``, ``pam``, ``cam`` loss.

    ``seg_logit`` is unpacked as ``(pam_cam, pam, cam)`` logits; the parent
    ``losses`` is evaluated on each and the results are merged into one
    dict after ``add_prefix`` is applied with the branch name.
    """
    pam_cam_seg_logit, pam_seg_logit, cam_seg_logit = seg_logit
    branches = (
        ('pam_cam', pam_cam_seg_logit),
        ('pam', pam_seg_logit),
        ('cam', cam_seg_logit),
    )
    loss = {}
    for prefix, branch_logit in branches:
        branch_loss = super(DAHead, self).losses(branch_logit, seg_label)
        loss.update(add_prefix(branch_loss, prefix))
    return loss
+ 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + Default: None. + loss_decode (dict): Config of decode loss. + Default: dict(type='CrossEntropyLoss'). + ignore_index (int | None): The label index to be ignored. When using + masked BCE loss, ignore_index should be set to None. Default: 255 + sampler (dict|None): The config of segmentation map sampler. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + """ + + def __init__(self, + in_channels, + channels, + *, + num_classes, + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + in_index=-1, + input_transform=None, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ignore_index=255, + sampler=None, + align_corners=False): + super(BaseDecodeHead, self).__init__() + self._init_inputs(in_channels, in_index, input_transform) + self.channels = channels + self.num_classes = num_classes + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_index = in_index + self.loss_decode = build_loss(loss_decode) + self.ignore_index = ignore_index + self.align_corners = align_corners + if sampler is not None: + self.sampler = build_pixel_sampler(sampler, context=self) + else: + self.sampler = None + + self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.fp16_enabled = False + + def extra_repr(self): + """Extra repr.""" + s = f'input_transform={self.input_transform}, ' \ + f'ignore_index={self.ignore_index}, ' \ + f'align_corners={self.align_corners}' + return s + + def 
_init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def init_weights(self): + """Initialize weights of classification layer.""" + normal_init(self.conv_seg, mean=0, std=0.01) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. 
def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
    """Forward function for training.

    Runs ``self.forward`` on ``inputs`` and hands the result, together with
    the ground truth, to ``self.losses``. ``img_metas`` and ``train_cfg``
    are accepted for interface compatibility and are not read here.

    Args:
        inputs (list[Tensor]): List of multi-level img features.
        img_metas (list[dict]): List of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
            For details on the values of these keys see
            `mmseg/datasets/pipelines/formatting.py:Collect`.
        gt_semantic_seg (Tensor): Semantic segmentation masks
            used if the architecture supports semantic segmentation task.
        train_cfg (dict): The training config.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    return self.losses(self.forward(inputs), gt_semantic_seg)
def cls_seg(self, feat):
    """Classify each pixel.

    Args:
        feat: Feature map fed to the classifier.

    Returns:
        Output of ``self.conv_seg``, applied after ``self.dropout`` when a
        dropout layer is configured (``self.dropout`` is not None).
    """
    regularized = feat if self.dropout is None else self.dropout(feat)
    return self.conv_seg(regularized)
def forward(self, x):
    """Forward function.

    A per-sample, per-channel kernel is generated from the pooled input and
    applied to the channel-reduced input as one grouped convolution.

    Args:
        x (Tensor): Input feature map.

    Returns:
        Tensor: Feature map with the same batch, channel and spatial size
        as the output of ``self.input_redu_conv``.
    """
    k = self.filter_size
    # One k x k kernel per (sample, channel), predicted from the pooled input.
    kernels = self.filter_gen_conv(F.adaptive_avg_pool2d(x, k))

    reduced = self.input_redu_conv(x)
    batch, chans, height, width = reduced.shape
    groups = batch * chans

    # Fold the batch into the channel axis so a single grouped conv can
    # apply a different kernel to every (sample, channel) pair.
    # [1, b * c, h, w], c = self.channels
    stacked = reduced.view(1, groups, height, width)
    # [b * c, 1, filter_size, filter_size]
    kernels = kernels.view(groups, 1, k, k)

    # Pad so the spatial size is preserved; an even kernel needs one extra
    # pixel on the left/top side.
    half, extra = divmod(k - 1, 2)
    padding = (half + extra, half, half + extra, half)
    padded = F.pad(input=stacked, pad=padding, mode='constant', value=0)

    # [1, b * c, h, w] -> [b, c, h, w]
    output = F.conv2d(input=padded, weight=kernels, groups=groups)
    output = output.view(batch, chans, height, width)

    if self.norm is not None:
        output = self.norm(output)
    output = self.activate(output)

    return self.fusion_conv(output) if self.fusion else output
def forward(self, inputs):
    """Forward function.

    Concatenates the transformed input with the output of every module in
    ``self.dcm_modules`` along the channel axis, then applies the
    bottleneck and the classifier.

    Args:
        inputs: Features accepted by ``self._transform_inputs``.

    Returns:
        Tensor: Output of ``self.cls_seg``.
    """
    feats = self._transform_inputs(inputs)
    # The raw features come first, followed by one branch per DCM module.
    branches = [feats] + [dcm(feats) for dcm in self.dcm_modules]
    fused = self.bottleneck(torch.cat(branches, dim=1))
    return self.cls_seg(fused)
def forward(self, x):
    """Apply the disentangled non-local block to ``x``.

    A pairwise term (attention computed from mean-subtracted ``theta_x``
    and ``phi_x``) and a unary term (a softmax mask from ``conv_mask``)
    are both applied to ``g_x``, summed, projected by ``conv_out`` and
    added back to the input.

    The mean subtraction is done out of place. In ``'gaussian'`` mode
    ``theta_x`` and ``phi_x`` are views of ``x``, so an in-place ``-=``
    would overwrite the caller's tensor, change what ``conv_mask`` and the
    residual connection see, and break autograd for inputs that require
    grad.

    Args:
        x (Tensor): Input of shape [N, C, H, W].

    Returns:
        Tensor: ``x + conv_out(pairwise + unary)``, same shape as ``x``.
    """
    # x: [N, C, H, W]
    n = x.size(0)

    # g_x: [N, HxW, C]
    g_x = self.g(x).view(n, self.inter_channels, -1)
    g_x = g_x.permute(0, 2, 1)

    # theta_x: [N, HxW, C], phi_x: [N, C, HxW]
    if self.mode == 'gaussian':
        theta_x = x.view(n, self.in_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        if self.sub_sample:
            phi_x = self.phi(x).view(n, self.in_channels, -1)
        else:
            phi_x = x.view(n, self.in_channels, -1)
    elif self.mode == 'concatenation':
        theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
        phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
    else:
        theta_x = self.theta(x).view(n, self.inter_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        phi_x = self.phi(x).view(n, self.inter_channels, -1)

    # subtract mean (out of place: these tensors may alias ``x``)
    theta_x = theta_x - theta_x.mean(dim=-2, keepdim=True)
    phi_x = phi_x - phi_x.mean(dim=-1, keepdim=True)

    # The pairwise function is looked up by mode name on the instance.
    pairwise_func = getattr(self, self.mode)
    # pairwise_weight: [N, HxW, HxW]
    pairwise_weight = pairwise_func(theta_x, phi_x)

    # y: [N, HxW, C]
    y = torch.matmul(pairwise_weight, g_x)
    # y: [N, C, H, W]
    y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
                                                *x.size()[2:])

    # unary_mask: [N, 1, HxW]
    unary_mask = self.conv_mask(x)
    unary_mask = unary_mask.view(n, 1, -1)
    unary_mask = unary_mask.softmax(dim=-1)
    # unary_x: [N, 1, C]
    unary_x = torch.matmul(unary_mask, g_x)
    # unary_x: [N, C, 1, 1]
    unary_x = unary_x.permute(0, 2, 1).contiguous().reshape(
        n, self.inter_channels, 1, 1)

    output = x + self.conv_out(y + unary_x)

    return output
def reduce_mean(tensor):
    """Reduce mean when distributed training.

    Args:
        tensor (Tensor): Value to average across processes.

    Returns:
        Tensor: ``tensor`` itself when ``torch.distributed`` is unavailable
        or not initialized. Otherwise a clone that was divided by the world
        size and then sum-reduced across processes.
    """
    distributed_ready = dist.is_available() and dist.is_initialized()
    if distributed_ready:
        # Work on a copy so the caller's tensor is left untouched.
        averaged = tensor.clone()
        averaged.div_(dist.get_world_size())
        dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
        return averaged
    return tensor
def forward(self, feats):
    """Forward function.

    Runs ``self.num_stages`` iterations alternating between computing the
    attention of every position over the bases and re-estimating the
    bases, then reconstructs the features from the final bases and
    attention. In training mode the stored ``self.bases`` is additionally
    updated as a moving average weighted by ``self.momentum``.

    Args:
        feats (Tensor): Input of shape [batch_size, channels, height, width].

    Returns:
        Tensor: Reconstructed features with the same shape as ``feats``.
    """
    n_batch, n_channels, rows, cols = feats.size()
    # [batch_size, channels, height*width]
    flat = feats.view(n_batch, n_channels, rows * cols)
    # [batch_size, channels, num_bases]
    mu = self.bases.repeat(n_batch, 1, 1)

    # The iterations themselves are excluded from autograd.
    with torch.no_grad():
        for _ in range(self.num_stages):
            # [batch_size, height*width, num_bases]
            attention = F.softmax(
                torch.einsum('bcn,bck->bnk', flat, mu), dim=2)
            # l1-normalise over positions, then re-estimate the bases
            # ([batch_size, channels, num_bases]) and l2-normalise them.
            position_weights = F.normalize(attention, dim=1, p=1)
            mu = F.normalize(
                torch.einsum('bcn,bnk->bck', flat, position_weights),
                dim=1,
                p=2)

    recon = torch.einsum('bck,bnk->bcn', mu, attention)
    recon = recon.view(n_batch, n_channels, rows, cols)

    if self.training:
        # Average over the batch (and across processes), renormalise, and
        # blend into the stored bases.
        batch_mu = reduce_mean(mu.mean(dim=0, keepdim=True))
        batch_mu = F.normalize(batch_mu, dim=1, p=2)
        self.bases = (1 -
                      self.momentum) * self.bases + self.momentum * batch_mu

    return recon
def forward(self, inputs):
    """Forward function.

    Args:
        inputs: Features accepted by ``self._transform_inputs``.

    Returns:
        Tensor: Output of ``self.cls_seg``.
    """
    x = self._transform_inputs(inputs)
    identity = self.ema_in_conv(x)

    # EMA path: mid conv -> EMA module -> ReLU -> out conv.
    recon = self.ema_module(self.ema_mid_conv(identity))
    recon = self.ema_out_conv(F.relu(recon, inplace=True))

    # Residual connection around the EMA path, then the bottleneck.
    output = self.bottleneck(F.relu(identity + recon, inplace=True))

    if self.concat_input:
        # Re-attach the head input before classification.
        output = self.conv_cat(torch.cat([x, output], dim=1))
    return self.cls_seg(output)
def forward(self, x):
    """Forward function.

    Args:
        x (Tensor): 4-D input feature map.

    Returns:
        tuple: ``(encoding_feat, output)`` where ``encoding_feat`` is the
        encoded feature averaged over dim 1, and ``output`` is
        ``relu(x + x * gamma)`` with ``gamma = self.fc(encoding_feat)``
        reshaped to one scale per (sample, channel).
    """
    projected = self.encoding_project(x)
    encoding_feat = self.encoding(projected).mean(dim=1)

    batch_size, channels, _, _ = x.size()
    # One multiplicative scale per (sample, channel).
    channel_scale = self.fc(encoding_feat).view(batch_size, channels, 1, 1)

    return encoding_feat, F.relu_(x + x * channel_scale)
def losses(self, seg_logit, seg_label):
    """Compute segmentation and semantic encoding loss.

    Args:
        seg_logit (Tensor | tuple[Tensor]): The output of ``forward``.
            With ``use_se_loss`` enabled this is the pair
            ``(seg_logit, se_seg_logit)``; otherwise it is the
            segmentation logits alone.
        seg_label (Tensor): Ground-truth segmentation label.

    Returns:
        dict: The parent-class segmentation losses, plus ``'loss_se'``
        when ``use_se_loss`` is enabled.
    """
    if not self.use_se_loss:
        # Without SE loss, ``forward`` returns a bare tensor and
        # ``loss_se_decode`` is never built. Unpacking here would split
        # the logits along the batch axis (or raise), so only the plain
        # segmentation loss applies.
        return super(EncHead, self).losses(seg_logit, seg_label)

    seg_logit, se_seg_logit = seg_logit
    loss = dict()
    loss.update(super(EncHead, self).losses(seg_logit, seg_label))
    # The SE loss is computed against per-image class-presence labels.
    se_loss = self.loss_se_decode(
        se_seg_logit,
        self._convert_to_onehot_labels(seg_label, self.num_classes))
    loss['loss_se'] = se_loss
    return loss
def forward(self, inputs):
    """Forward function.

    Args:
        inputs: Features accepted by ``self._transform_inputs``.

    Returns:
        Tensor: Output of ``self.cls_seg``.
    """
    x = self._transform_inputs(inputs)
    feats = self.convs(x)
    if self.concat_input:
        # Fuse the head input with the conv output along the channel axis.
        feats = self.conv_cat(torch.cat([x, feats], dim=1))
    return self.cls_seg(feats)
def forward(self, inputs):
    """Forward function.

    Every selected feature level goes through its own scale head. The
    outputs of levels 1.. are resized to the spatial size of the level-0
    output and summed onto it before classification.

    Args:
        inputs: Features accepted by ``self._transform_inputs``.

    Returns:
        Tensor: Output of ``self.cls_seg``.
    """
    levels = self._transform_inputs(inputs)

    fused = self.scale_heads[0](levels[0])
    for level in range(1, len(self.feature_strides)):
        branch = resize(
            self.scale_heads[level](levels[level]),
            size=fused.shape[2:],
            mode='bilinear',
            align_corners=self.align_corners)
        # non inplace
        fused = fused + branch

    return self.cls_seg(fused)
def forward(self, inputs):
    """Forward function.

    Applies ``self.gc_block`` between the two convs of the head.

    Args:
        inputs: Features accepted by ``self._transform_inputs``.

    Returns:
        Tensor: Output of ``self.cls_seg``.
    """
    x = self._transform_inputs(inputs)
    first_conv, second_conv = self.convs[0], self.convs[1]

    feats = second_conv(self.gc_block(first_conv(x)))
    if self.concat_input:
        # Fuse the head input with the processed features.
        feats = self.conv_cat(torch.cat([x, feats], dim=1))
    return self.cls_seg(feats)
Default: (32, 64). + """ + + def __init__(self, branch_channels=(32, 64), **kwargs): + super(LRASPPHead, self).__init__(**kwargs) + if self.input_transform != 'multiple_select': + raise ValueError('in Lite R-ASPP (LRASPP) head, input_transform ' + f'must be \'multiple_select\'. But received ' + f'\'{self.input_transform}\'') + assert is_tuple_of(branch_channels, int) + assert len(branch_channels) == len(self.in_channels) - 1 + self.branch_channels = branch_channels + + self.convs = nn.Sequential() + self.conv_ups = nn.Sequential() + for i in range(len(branch_channels)): + self.convs.add_module( + f'conv{i}', + nn.Conv2d( + self.in_channels[i], branch_channels[i], 1, bias=False)) + self.conv_ups.add_module( + f'conv_up{i}', + ConvModule( + self.channels + branch_channels[i], + self.channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=False)) + + self.conv_up_input = nn.Conv2d(self.channels, self.channels, 1) + + self.aspp_conv = ConvModule( + self.in_channels[-1], + self.channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=False) + self.image_pool = nn.Sequential( + nn.AvgPool2d(kernel_size=49, stride=(16, 20)), + ConvModule( + self.in_channels[2], + self.channels, + 1, + act_cfg=dict(type='Sigmoid'), + bias=False)) + + def forward(self, inputs): + """Forward function.""" + inputs = self._transform_inputs(inputs) + + x = inputs[-1] + + x = self.aspp_conv(x) * resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + x = self.conv_up_input(x) + + for i in range(len(self.branch_channels) - 1, -1, -1): + x = resize( + x, + size=inputs[i].size()[2:], + mode='bilinear', + align_corners=self.align_corners) + x = torch.cat([x, self.convs[i](inputs[i])], 1) + x = self.conv_ups[i](x) + + return self.cls_seg(x) diff --git a/lavis/common/annotator/uniformer/mmseg/models/decode_heads/nl_head.py b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/nl_head.py new file mode 100644 index 
0000000000000000000000000000000000000000..3eee424199e6aa363b564e2a3340a070db04db86 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/nl_head.py @@ -0,0 +1,49 @@ +import torch +from annotator.uniformer.mmcv.cnn import NonLocal2d + +from ..builder import HEADS +from .fcn_head import FCNHead + + +@HEADS.register_module() +class NLHead(FCNHead): + """Non-local Neural Networks. + + This head is the implementation of `NLNet + `_. + + Args: + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: True. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian.'. + """ + + def __init__(self, + reduction=2, + use_scale=True, + mode='embedded_gaussian', + **kwargs): + super(NLHead, self).__init__(num_convs=2, **kwargs) + self.reduction = reduction + self.use_scale = use_scale + self.mode = mode + self.nl_block = NonLocal2d( + in_channels=self.channels, + reduction=self.reduction, + use_scale=self.use_scale, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + mode=self.mode) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + output = self.nl_block(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/lavis/common/annotator/uniformer/mmseg/models/decode_heads/ocr_head.py b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/ocr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..715852e94e81dc46623972748285d2d19237a341 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/ocr_head.py @@ -0,0 +1,127 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from annotator.uniformer.mmcv.cnn import ConvModule + +from 
annotator.uniformer.mmseg.ops import resize +from ..builder import HEADS +from ..utils import SelfAttentionBlock as _SelfAttentionBlock +from .cascade_decode_head import BaseCascadeDecodeHead + + +class SpatialGatherModule(nn.Module): + """Aggregate the context features according to the initial predicted + probability distribution. + + Employ the soft-weighted method to aggregate the context. + """ + + def __init__(self, scale): + super(SpatialGatherModule, self).__init__() + self.scale = scale + + def forward(self, feats, probs): + """Forward function.""" + batch_size, num_classes, height, width = probs.size() + channels = feats.size(1) + probs = probs.view(batch_size, num_classes, -1) + feats = feats.view(batch_size, channels, -1) + # [batch_size, height*width, num_classes] + feats = feats.permute(0, 2, 1) + # [batch_size, channels, height*width] + probs = F.softmax(self.scale * probs, dim=2) + # [batch_size, channels, num_classes] + ocr_context = torch.matmul(probs, feats) + ocr_context = ocr_context.permute(0, 2, 1).contiguous().unsqueeze(3) + return ocr_context + + +class ObjectAttentionBlock(_SelfAttentionBlock): + """Make a OCR used SelfAttentionBlock.""" + + def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, + act_cfg): + if scale > 1: + query_downsample = nn.MaxPool2d(kernel_size=scale) + else: + query_downsample = None + super(ObjectAttentionBlock, self).__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=query_downsample, + key_downsample=None, + key_query_num_convs=2, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=True, + matmul_norm=True, + with_out=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.bottleneck = ConvModule( + in_channels * 2, + in_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, query_feats, 
key_feats): + """Forward function.""" + context = super(ObjectAttentionBlock, + self).forward(query_feats, key_feats) + output = self.bottleneck(torch.cat([context, query_feats], dim=1)) + if self.query_downsample is not None: + output = resize(query_feats) + + return output + + +@HEADS.register_module() +class OCRHead(BaseCascadeDecodeHead): + """Object-Contextual Representations for Semantic Segmentation. + + This head is the implementation of `OCRNet + `_. + + Args: + ocr_channels (int): The intermediate channels of OCR block. + scale (int): The scale of probability map in SpatialGatherModule in + Default: 1. + """ + + def __init__(self, ocr_channels, scale=1, **kwargs): + super(OCRHead, self).__init__(**kwargs) + self.ocr_channels = ocr_channels + self.scale = scale + self.object_context_block = ObjectAttentionBlock( + self.channels, + self.ocr_channels, + self.scale, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.spatial_gather_module = SpatialGatherModule(self.scale) + + self.bottleneck = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs, prev_output): + """Forward function.""" + x = self._transform_inputs(inputs) + feats = self.bottleneck(x) + context = self.spatial_gather_module(feats, prev_output) + object_context = self.object_context_block(feats, context) + output = self.cls_seg(object_context) + + return output diff --git a/lavis/common/annotator/uniformer/mmseg/models/decode_heads/point_head.py b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/point_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3342aa28bb8d264b2c3d01cbf5098d145943c193 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/point_head.py @@ -0,0 +1,349 @@ +# Modified from 
https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa + +import torch +import torch.nn as nn +from annotator.uniformer.mmcv.cnn import ConvModule, normal_init +from annotator.uniformer.mmcv.ops import point_sample + +from annotator.uniformer.mmseg.models.builder import HEADS +from annotator.uniformer.mmseg.ops import resize +from ..losses import accuracy +from .cascade_decode_head import BaseCascadeDecodeHead + + +def calculate_uncertainty(seg_logits): + """Estimate uncertainty based on seg logits. + + For each location of the prediction ``seg_logits`` we estimate + uncertainty as the difference between top first and top second + predicted logits. + + Args: + seg_logits (Tensor): Semantic segmentation logits, + shape (batch_size, num_classes, height, width). + + Returns: + scores (Tensor): T uncertainty scores with the most uncertain + locations having the highest uncertainty score, shape ( + batch_size, 1, height, width) + """ + top2_scores = torch.topk(seg_logits, k=2, dim=1)[0] + return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1) + + +@HEADS.register_module() +class PointHead(BaseCascadeDecodeHead): + """A mask point head use in PointRend. + + ``PointHead`` use shared multi-layer perceptron (equivalent to + nn.Conv1d) to predict the logit of input points. The fine-grained feature + and coarse feature will be concatenate together for predication. + + Args: + num_fcs (int): Number of fc layers in the head. Default: 3. + in_channels (int): Number of input channels. Default: 256. + fc_channels (int): Number of fc channels. Default: 256. + num_classes (int): Number of classes for logits. Default: 80. + class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Default: False. + coarse_pred_each_layer (bool): Whether concatenate coarse feature with + the output of each fc layer. Default: True. 
+ conv_cfg (dict|None): Dictionary to construct and config conv layer. + Default: dict(type='Conv1d')) + norm_cfg (dict|None): Dictionary to construct and config norm layer. + Default: None. + loss_point (dict): Dictionary to construct and config loss layer of + point head. Default: dict(type='CrossEntropyLoss', use_mask=True, + loss_weight=1.0). + """ + + def __init__(self, + num_fcs=3, + coarse_pred_each_layer=True, + conv_cfg=dict(type='Conv1d'), + norm_cfg=None, + act_cfg=dict(type='ReLU', inplace=False), + **kwargs): + super(PointHead, self).__init__( + input_transform='multiple_select', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs) + + self.num_fcs = num_fcs + self.coarse_pred_each_layer = coarse_pred_each_layer + + fc_in_channels = sum(self.in_channels) + self.num_classes + fc_channels = self.channels + self.fcs = nn.ModuleList() + for k in range(num_fcs): + fc = ConvModule( + fc_in_channels, + fc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.fcs.append(fc) + fc_in_channels = fc_channels + fc_in_channels += self.num_classes if self.coarse_pred_each_layer \ + else 0 + self.fc_seg = nn.Conv1d( + fc_in_channels, + self.num_classes, + kernel_size=1, + stride=1, + padding=0) + if self.dropout_ratio > 0: + self.dropout = nn.Dropout(self.dropout_ratio) + delattr(self, 'conv_seg') + + def init_weights(self): + """Initialize weights of classification layer.""" + normal_init(self.fc_seg, std=0.001) + + def cls_seg(self, feat): + """Classify each pixel with fc.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.fc_seg(feat) + return output + + def forward(self, fine_grained_point_feats, coarse_point_feats): + x = torch.cat([fine_grained_point_feats, coarse_point_feats], dim=1) + for fc in self.fcs: + x = fc(x) + if self.coarse_pred_each_layer: + x = torch.cat((x, coarse_point_feats), dim=1) + return self.cls_seg(x) + + def 
_get_fine_grained_point_feats(self, x, points): + """Sample from fine grained features. + + Args: + x (list[Tensor]): Feature pyramid from by neck or backbone. + points (Tensor): Point coordinates, shape (batch_size, + num_points, 2). + + Returns: + fine_grained_feats (Tensor): Sampled fine grained feature, + shape (batch_size, sum(channels of x), num_points). + """ + + fine_grained_feats_list = [ + point_sample(_, points, align_corners=self.align_corners) + for _ in x + ] + if len(fine_grained_feats_list) > 1: + fine_grained_feats = torch.cat(fine_grained_feats_list, dim=1) + else: + fine_grained_feats = fine_grained_feats_list[0] + + return fine_grained_feats + + def _get_coarse_point_feats(self, prev_output, points): + """Sample from fine grained features. + + Args: + prev_output (list[Tensor]): Prediction of previous decode head. + points (Tensor): Point coordinates, shape (batch_size, + num_points, 2). + + Returns: + coarse_feats (Tensor): Sampled coarse feature, shape (batch_size, + num_classes, num_points). + """ + + coarse_feats = point_sample( + prev_output, points, align_corners=self.align_corners) + + return coarse_feats + + def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, + train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self._transform_inputs(inputs) + with torch.no_grad(): + points = self.get_points_train( + prev_output, calculate_uncertainty, cfg=train_cfg) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats(prev_output, points) + point_logits = self.forward(fine_grained_point_feats, + coarse_point_feats) + point_label = point_sample( + gt_semantic_seg.float(), + points, + mode='nearest', + align_corners=self.align_corners) + point_label = point_label.squeeze(1).long() + + losses = self.losses(point_logits, point_label) + + return losses + + def forward_test(self, inputs, prev_output, img_metas, test_cfg): + """Forward function for testing. + + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. 
+ """ + + x = self._transform_inputs(inputs) + refined_seg_logits = prev_output.clone() + for _ in range(test_cfg.subdivision_steps): + refined_seg_logits = resize( + refined_seg_logits, + scale_factor=test_cfg.scale_factor, + mode='bilinear', + align_corners=self.align_corners) + batch_size, channels, height, width = refined_seg_logits.shape + point_indices, points = self.get_points_test( + refined_seg_logits, calculate_uncertainty, cfg=test_cfg) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats( + prev_output, points) + point_logits = self.forward(fine_grained_point_feats, + coarse_point_feats) + + point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) + refined_seg_logits = refined_seg_logits.reshape( + batch_size, channels, height * width) + refined_seg_logits = refined_seg_logits.scatter_( + 2, point_indices, point_logits) + refined_seg_logits = refined_seg_logits.view( + batch_size, channels, height, width) + + return refined_seg_logits + + def losses(self, point_logits, point_label): + """Compute segmentation loss.""" + loss = dict() + loss['loss_point'] = self.loss_decode( + point_logits, point_label, ignore_index=self.ignore_index) + loss['acc_point'] = accuracy(point_logits, point_label) + return loss + + def get_points_train(self, seg_logits, uncertainty_func, cfg): + """Sample points for training. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + 'uncertainty_func' function that takes point's logit prediction as + input. + + Args: + seg_logits (Tensor): Semantic segmentation logits, shape ( + batch_size, num_classes, height, width). + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Training config of point head. 
+ + Returns: + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains the coordinates of ``num_points`` sampled + points. + """ + num_points = cfg.num_points + oversample_ratio = cfg.oversample_ratio + importance_sample_ratio = cfg.importance_sample_ratio + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = seg_logits.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand( + batch_size, num_sampled, 2, device=seg_logits.device) + point_logits = point_sample(seg_logits, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = uncertainty_func(point_logits) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=seg_logits.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_point_coords = torch.rand( + batch_size, num_random_points, 2, device=seg_logits.device) + point_coords = torch.cat((point_coords, rand_point_coords), dim=1) + return point_coords + + def get_points_test(self, seg_logits, uncertainty_func, cfg): + """Sample points for testing. 
+ + Find ``num_points`` most uncertain points from ``uncertainty_map``. + + Args: + seg_logits (Tensor): A tensor of shape (batch_size, num_classes, + height, width) for class-specific or class-agnostic prediction. + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Testing config of point head. + + Returns: + point_indices (Tensor): A tensor of shape (batch_size, num_points) + that contains indices from [0, height x width) of the most + uncertain points. + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the ``height x width`` grid . + """ + + num_points = cfg.subdivision_num_points + uncertainty_map = uncertainty_func(seg_logits) + batch_size, _, height, width = uncertainty_map.shape + h_step = 1.0 / height + w_step = 1.0 / width + + uncertainty_map = uncertainty_map.view(batch_size, height * width) + num_points = min(height * width, num_points) + point_indices = uncertainty_map.topk(num_points, dim=1)[1] + point_coords = torch.zeros( + batch_size, + num_points, + 2, + dtype=torch.float, + device=seg_logits.device) + point_coords[:, :, 0] = w_step / 2.0 + (point_indices % + width).float() * w_step + point_coords[:, :, 1] = h_step / 2.0 + (point_indices // + width).float() * h_step + return point_indices, point_coords diff --git a/lavis/common/annotator/uniformer/mmseg/models/decode_heads/psa_head.py b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/psa_head.py new file mode 100644 index 0000000000000000000000000000000000000000..480dbd1a081262e45bf87e32c4a339ac8f8b4ffb --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/psa_head.py @@ -0,0 +1,196 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from annotator.uniformer.mmcv.cnn import ConvModule + +from annotator.uniformer.mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + 
+try: + from annotator.uniformer.mmcv.ops import PSAMask +except ModuleNotFoundError: + PSAMask = None + + +@HEADS.register_module() +class PSAHead(BaseDecodeHead): + """Point-wise Spatial Attention Network for Scene Parsing. + + This head is the implementation of `PSANet + `_. + + Args: + mask_size (tuple[int]): The PSA mask size. It usually equals input + size. + psa_type (str): The type of psa module. Options are 'collect', + 'distribute', 'bi-direction'. Default: 'bi-direction' + compact (bool): Whether use compact map for 'collect' mode. + Default: True. + shrink_factor (int): The downsample factors of psa mask. Default: 2. + normalization_factor (float): The normalize factor of attention. + psa_softmax (bool): Whether use softmax for attention. + """ + + def __init__(self, + mask_size, + psa_type='bi-direction', + compact=False, + shrink_factor=2, + normalization_factor=1.0, + psa_softmax=True, + **kwargs): + if PSAMask is None: + raise RuntimeError('Please install mmcv-full for PSAMask ops') + super(PSAHead, self).__init__(**kwargs) + assert psa_type in ['collect', 'distribute', 'bi-direction'] + self.psa_type = psa_type + self.compact = compact + self.shrink_factor = shrink_factor + self.mask_size = mask_size + mask_h, mask_w = mask_size + self.psa_softmax = psa_softmax + if normalization_factor is None: + normalization_factor = mask_h * mask_w + self.normalization_factor = normalization_factor + + self.reduce = ConvModule( + self.in_channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.attention = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + self.channels, mask_h * mask_w, kernel_size=1, bias=False)) + if psa_type == 'bi-direction': + self.reduce_p = ConvModule( + self.in_channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + 
norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.attention_p = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + self.channels, mask_h * mask_w, kernel_size=1, bias=False)) + self.psamask_collect = PSAMask('collect', mask_size) + self.psamask_distribute = PSAMask('distribute', mask_size) + else: + self.psamask = PSAMask(psa_type, mask_size) + self.proj = ConvModule( + self.channels * (2 if psa_type == 'bi-direction' else 1), + self.in_channels, + kernel_size=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + self.in_channels * 2, + self.channels, + kernel_size=3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + identity = x + align_corners = self.align_corners + if self.psa_type in ['collect', 'distribute']: + out = self.reduce(x) + n, c, h, w = out.size() + if self.shrink_factor != 1: + if h % self.shrink_factor and w % self.shrink_factor: + h = (h - 1) // self.shrink_factor + 1 + w = (w - 1) // self.shrink_factor + 1 + align_corners = True + else: + h = h // self.shrink_factor + w = w // self.shrink_factor + align_corners = False + out = resize( + out, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + y = self.attention(out) + if self.compact: + if self.psa_type == 'collect': + y = y.view(n, h * w, + h * w).transpose(1, 2).view(n, h * w, h, w) + else: + y = self.psamask(y) + if self.psa_softmax: + y = F.softmax(y, dim=1) + out = torch.bmm( + out.view(n, c, h * w), y.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + else: + x_col = self.reduce(x) + x_dis = self.reduce_p(x) + n, c, h, w = x_col.size() + if self.shrink_factor != 1: + if h % self.shrink_factor and w % self.shrink_factor: 
+ h = (h - 1) // self.shrink_factor + 1 + w = (w - 1) // self.shrink_factor + 1 + align_corners = True + else: + h = h // self.shrink_factor + w = w // self.shrink_factor + align_corners = False + x_col = resize( + x_col, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + x_dis = resize( + x_dis, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + y_col = self.attention(x_col) + y_dis = self.attention_p(x_dis) + if self.compact: + y_dis = y_dis.view(n, h * w, + h * w).transpose(1, 2).view(n, h * w, h, w) + else: + y_col = self.psamask_collect(y_col) + y_dis = self.psamask_distribute(y_dis) + if self.psa_softmax: + y_col = F.softmax(y_col, dim=1) + y_dis = F.softmax(y_dis, dim=1) + x_col = torch.bmm( + x_col.view(n, c, h * w), y_col.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + x_dis = torch.bmm( + x_dis.view(n, c, h * w), y_dis.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + out = torch.cat([x_col, x_dis], 1) + out = self.proj(out) + out = resize( + out, + size=identity.shape[2:], + mode='bilinear', + align_corners=align_corners) + out = self.bottleneck(torch.cat((identity, out), dim=1)) + out = self.cls_seg(out) + return out diff --git a/lavis/common/annotator/uniformer/mmseg/models/decode_heads/psp_head.py b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/psp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b5f1e71c70c3a20f4007c263ec471a87bb214a48 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/psp_head.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from annotator.uniformer.mmcv.cnn import ConvModule + +from annotator.uniformer.mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class PPM(nn.ModuleList): + """Pooling Pyramid Module used in PSPNet. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. 
+ in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + align_corners (bool): align_corners argument of F.interpolate. + """ + + def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, + act_cfg, align_corners): + super(PPM, self).__init__() + self.pool_scales = pool_scales + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + for pool_scale in pool_scales: + self.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(pool_scale), + ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg))) + + def forward(self, x): + """Forward function.""" + ppm_outs = [] + for ppm in self: + ppm_out = ppm(x) + upsampled_ppm_out = resize( + ppm_out, + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +@HEADS.register_module() +class PSPHead(BaseDecodeHead): + """Pyramid Scene Parsing Network. + + This head is the implementation of + `PSPNet `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. Default: (1, 2, 3, 6). 
+ """ + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super(PSPHead, self).__init__(**kwargs) + assert isinstance(pool_scales, (list, tuple)) + self.pool_scales = pool_scales + self.psp_modules = PPM( + self.pool_scales, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.bottleneck = ConvModule( + self.in_channels + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + output = self.bottleneck(psp_outs) + output = self.cls_seg(output) + return output diff --git a/lavis/common/annotator/uniformer/mmseg/models/decode_heads/sep_aspp_head.py b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/sep_aspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3339a7ac56e77dfc638e9bffb557d4699148686b --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/decode_heads/sep_aspp_head.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from annotator.uniformer.mmcv.cnn import ConvModule, DepthwiseSeparableConvModule + +from annotator.uniformer.mmseg.ops import resize +from ..builder import HEADS +from .aspp_head import ASPPHead, ASPPModule + + +class DepthwiseSeparableASPPModule(ASPPModule): + """Atrous Spatial Pyramid Pooling (ASPP) Module with depthwise separable + conv.""" + + def __init__(self, **kwargs): + super(DepthwiseSeparableASPPModule, self).__init__(**kwargs) + for i, dilation in enumerate(self.dilations): + if dilation > 1: + self[i] = DepthwiseSeparableConvModule( + self.in_channels, + self.channels, + 3, + dilation=dilation, + padding=dilation, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + 
+@HEADS.register_module() +class DepthwiseSeparableASPPHead(ASPPHead): + """Encoder-Decoder with Atrous Separable Convolution for Semantic Image + Segmentation. + + This head is the implementation of `DeepLabV3+ + `_. + + Args: + c1_in_channels (int): The input channels of c1 decoder. If is 0, + the no decoder will be used. + c1_channels (int): The intermediate channels of c1 decoder. + """ + + def __init__(self, c1_in_channels, c1_channels, **kwargs): + super(DepthwiseSeparableASPPHead, self).__init__(**kwargs) + assert c1_in_channels >= 0 + self.aspp_modules = DepthwiseSeparableASPPModule( + dilations=self.dilations, + in_channels=self.in_channels, + channels=self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if c1_in_channels > 0: + self.c1_bottleneck = ConvModule( + c1_in_channels, + c1_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + self.c1_bottleneck = None + self.sep_bottleneck = nn.Sequential( + DepthwiseSeparableConvModule( + self.channels + c1_channels, + self.channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + DepthwiseSeparableConvModule( + self.channels, + self.channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + aspp_outs = [ + resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ] + aspp_outs.extend(self.aspp_modules(x)) + aspp_outs = torch.cat(aspp_outs, dim=1) + output = self.bottleneck(aspp_outs) + if self.c1_bottleneck is not None: + c1_output = self.c1_bottleneck(inputs[0]) + output = resize( + input=output, + size=c1_output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + output = torch.cat([output, c1_output], dim=1) + output = self.sep_bottleneck(output) + output = self.cls_seg(output) + return output diff --git 
@HEADS.register_module()
class DepthwiseSeparableFCNHead(FCNHead):
    """FCN head whose convolutions are depthwise separable.

    This head is implemented according to the Fast-SCNN paper. All keyword
    arguments are forwarded to ``FCNHead``; afterwards every entry of
    ``self.convs`` (and ``self.conv_cat`` when ``concat_input`` is set) is
    replaced by a ``DepthwiseSeparableConvModule`` of matching width.

    Args:
        in_channels (int): Number of output channels of FFM.
        channels (int): Number of middle-stage channels in the decode head.
        concat_input (bool): Whether to concatenate the original decode
            input into the result of the consecutive convolution layers.
            Default: True.
        num_classes (int): Used to determine the dimension of the final
            prediction tensor.
        in_index (int): Corresponds with 'out_indices' in the FastSCNN
            backbone.
        norm_cfg (dict | None): Config of norm layers.
        align_corners (bool): align_corners argument of F.interpolate.
            Default: False.
        loss_decode (dict): Config of loss type and some relevant
            additional options.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Settings shared by every separable conv built below.
        shared = dict(
            kernel_size=self.kernel_size,
            padding=self.kernel_size // 2,
            norm_cfg=self.norm_cfg)

        # The first conv maps the input width to the head width.
        self.convs[0] = DepthwiseSeparableConvModule(
            self.in_channels, self.channels, **shared)
        for conv_idx in range(1, self.num_convs):
            self.convs[conv_idx] = DepthwiseSeparableConvModule(
                self.channels, self.channels, **shared)

        if self.concat_input:
            self.conv_cat = DepthwiseSeparableConvModule(
                self.in_channels + self.channels, self.channels, **shared)
@HEADS.register_module()
class UPerHead(BaseDecodeHead):
    """Unified Perceptual Parsing for Scene Understanding (UPerNet) head.

    A pyramid pooling module is applied to the last input feature map, and
    an FPN-style top-down path fuses it with the remaining feature maps.

    Args:
        pool_scales (tuple[int]): Pooling scales used in the Pooling Pyramid
            Module applied on the last feature. Default: (1, 2, 3, 6).
    """

    def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
        super().__init__(input_transform='multiple_select', **kwargs)
        top_channels = self.in_channels[-1]

        # PSP module on the top-level feature.
        self.psp_modules = PPM(
            pool_scales,
            top_channels,
            self.channels,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            align_corners=self.align_corners)
        self.bottleneck = ConvModule(
            top_channels + len(pool_scales) * self.channels,
            self.channels,
            3,
            padding=1,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

        # FPN module: one lateral and one output conv per level below the top.
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for level_channels in self.in_channels[:-1]:
            lateral = ConvModule(
                level_channels,
                self.channels,
                1,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg,
                inplace=False)
            smooth = ConvModule(
                self.channels,
                self.channels,
                3,
                padding=1,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg,
                inplace=False)
            self.lateral_convs.append(lateral)
            self.fpn_convs.append(smooth)

        self.fpn_bottleneck = ConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            3,
            padding=1,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def psp_forward(self, inputs):
        """Forward function of PSP module."""
        top = inputs[-1]
        pyramid = [top, *self.psp_modules(top)]
        return self.bottleneck(torch.cat(pyramid, dim=1))

    def forward(self, inputs):
        """Forward function."""
        inputs = self._transform_inputs(inputs)

        # Lateral features for every level, with the PSP output on top.
        laterals = [
            conv(inputs[level])
            for level, conv in enumerate(self.lateral_convs)
        ]
        laterals.append(self.psp_forward(inputs))

        # Top-down path: add each upsampled coarser level into the finer one.
        num_levels = len(laterals)
        for level in range(num_levels - 1, 0, -1):
            finer_shape = laterals[level - 1].shape[2:]
            laterals[level - 1] += resize(
                laterals[level],
                size=finer_shape,
                mode='bilinear',
                align_corners=self.align_corners)

        # Output convs on every level except the PSP one, which is used as is.
        fpn_outs = [
            conv(lateral)
            for conv, lateral in zip(self.fpn_convs, laterals[:-1])
        ]
        fpn_outs.append(laterals[-1])

        # Bring every coarser output to the resolution of the finest one.
        finest_shape = fpn_outs[0].shape[2:]
        fpn_outs[1:] = [
            resize(
                out,
                size=finest_shape,
                mode='bilinear',
                align_corners=self.align_corners) for out in fpn_outs[1:]
        ]

        fused = self.fpn_bottleneck(torch.cat(fpn_outs, dim=1))
        return self.cls_seg(fused)
def accuracy(pred, target, topk=1, thresh=None):
    """Calculate accuracy according to the prediction and target.

    Args:
        pred (torch.Tensor): The model prediction, shape (N, num_class, ...)
        target (torch.Tensor): The target of each prediction, shape (N, ...)
        topk (int | tuple[int], optional): If the predictions in ``topk``
            matches the target, the predictions will be regarded as
            correct ones. Defaults to 1.
        thresh (float, optional): If not None, predictions with scores under
            this threshold are considered incorrect. Default to None.

    Returns:
        torch.Tensor | list[torch.Tensor]: If ``topk`` is a single integer,
            a single one-element tensor holding the accuracy in percent.
            If ``topk`` is a tuple, a list with one such tensor per entry.
            For an empty batch every accuracy is a scalar zero tensor.
    """
    assert isinstance(topk, (int, tuple))
    return_single = isinstance(topk, int)
    if return_single:
        topk = (topk, )

    maxk = max(topk)
    if pred.size(0) == 0:
        accu = [pred.new_tensor(0.) for _ in topk]
        return accu[0] if return_single else accu
    assert pred.ndim == target.ndim + 1
    assert pred.size(0) == target.size(0)
    assert maxk <= pred.size(1), \
        f'maxk {maxk} exceeds pred dimension {pred.size(1)}'
    pred_value, pred_label = pred.topk(maxk, dim=1)
    # transpose to shape (maxk, N, ...)
    pred_label = pred_label.transpose(0, 1)
    correct = pred_label.eq(target.unsqueeze(0).expand_as(pred_label))
    if thresh is not None:
        # Only prediction values larger than thresh are counted as correct.
        # transpose(0, 1) mirrors the label transpose above; Tensor.t() only
        # accepts tensors with at most 2 dimensions and therefore fails on
        # dense (N, C, H, W) predictions.
        correct = correct & (pred_value > thresh).transpose(0, 1)
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / target.numel()))
    return res[0] if return_single else res


class Accuracy(nn.Module):
    """Accuracy calculation module."""

    def __init__(self, topk=(1, ), thresh=None):
        """Module to calculate the accuracy.

        Args:
            topk (tuple, optional): The criterion used to calculate the
                accuracy. Defaults to (1,).
            thresh (float, optional): If not None, predictions with scores
                under this threshold are considered incorrect. Default to
                None.
        """
        super().__init__()
        self.topk = topk
        self.thresh = thresh

    def forward(self, pred, target):
        """Forward function to calculate accuracy.

        Args:
            pred (torch.Tensor): Prediction of models.
            target (torch.Tensor): Target for each prediction.

        Returns:
            The result of :func:`accuracy` for the configured ``topk`` and
            ``thresh``.
        """
        return accuracy(pred, target, self.topk, self.thresh)
+ # If given, has to be a Tensor of size C element-wise losses + loss = F.cross_entropy( + pred, + label, + weight=class_weight, + reduction='none', + ignore_index=ignore_index) + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_onehot_labels(labels, label_weights, target_shape, ignore_index): + """Expand onehot labels to match the size of prediction.""" + bin_labels = labels.new_zeros(target_shape) + valid_mask = (labels >= 0) & (labels != ignore_index) + inds = torch.nonzero(valid_mask, as_tuple=True) + + if inds[0].numel() > 0: + if labels.dim() == 3: + bin_labels[inds[0], labels[valid_mask], inds[1], inds[2]] = 1 + else: + bin_labels[inds[0], labels[valid_mask]] = 1 + + valid_mask = valid_mask.unsqueeze(1).expand(target_shape).float() + if label_weights is None: + bin_label_weights = valid_mask + else: + bin_label_weights = label_weights.unsqueeze(1).expand(target_shape) + bin_label_weights *= valid_mask + + return bin_labels, bin_label_weights + + +def binary_cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=255): + """Calculate the binary CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 1). + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (int | None): The label index to be ignored. 
def binary_cross_entropy(pred,
                         label,
                         weight=None,
                         reduction='mean',
                         avg_factor=None,
                         class_weight=None,
                         ignore_index=255):
    """Calculate the binary CrossEntropy loss.

    When ``pred`` and ``label`` differ in rank, ``label`` (and ``weight``)
    are first expanded to one-hot form matching ``pred``.

    Args:
        pred (torch.Tensor): The prediction logits.
        label (torch.Tensor): The learning label of the prediction.
        weight (torch.Tensor, optional): Sample-wise loss weight.
        reduction (str, optional): The method used to reduce the loss.
            Options are "none", "mean" and "sum".
        avg_factor (int, optional): Average factor that is used to average
            the loss. Defaults to None.
        class_weight (list[float], optional): The weight for each class;
            passed as ``pos_weight`` to the underlying BCE call.
        ignore_index (int | None): The label index to be ignored.
            Default: 255

    Returns:
        torch.Tensor: The calculated loss
    """
    if pred.dim() != label.dim():
        rank_pair = (pred.dim(), label.dim())
        assert rank_pair in ((2, 1), (4, 3)), \
            'Only pred shape [N, C], label shape [N] or pred shape [N, C, ' \
            'H, W], label shape [N, H, W] are supported'
        label, weight = _expand_onehot_labels(label, weight, pred.shape,
                                              ignore_index)

    # weighted element-wise losses
    if weight is not None:
        weight = weight.float()
    elementwise = F.binary_cross_entropy_with_logits(
        pred, label.float(), pos_weight=class_weight, reduction='none')

    # do the reduction for the weighted loss
    return weight_reduce_loss(
        elementwise, weight, reduction=reduction, avg_factor=avg_factor)
def mask_cross_entropy(pred,
                       target,
                       label,
                       reduction='mean',
                       avg_factor=None,
                       class_weight=None,
                       ignore_index=None):
    """Calculate the CrossEntropy loss for masks.

    For every sample the mask of the class given by ``label`` is selected
    from ``pred`` and compared against ``target`` with a mean-reduced
    binary cross entropy on logits.

    Args:
        pred (torch.Tensor): The prediction; its second dimension indexes
            the classes.
        target (torch.Tensor): The learning label of the prediction.
        label (torch.Tensor): Class label of each mask's object, used to
            pick the mask of that class from ``pred``.
        reduction (str, optional): Reserved; must be "mean".
        avg_factor (int, optional): Reserved; must be None.
        class_weight (list[float], optional): The weight for each class.
        ignore_index (None): Placeholder, to be consistent with other loss.
            Default: None.

    Returns:
        torch.Tensor: The calculated loss, as a one-element tensor.
    """
    assert ignore_index is None, 'BCE loss does not support ignore_index'
    # TODO: handle these two reserved arguments
    assert reduction == 'mean' and avg_factor is None
    roi_index = torch.arange(
        0, pred.size()[0], dtype=torch.long, device=pred.device)
    selected = pred[roi_index, label].squeeze(1)
    loss = F.binary_cross_entropy_with_logits(
        selected, target, weight=class_weight, reduction='mean')
    # Index with None to add a leading dimension to the scalar loss.
    return loss[None]
@LOSSES.register_module()
class CrossEntropyLoss(nn.Module):
    """CrossEntropyLoss.

    Args:
        use_sigmoid (bool, optional): Whether the prediction uses sigmoid
            instead of softmax. Defaults to False.
        use_mask (bool, optional): Whether to use mask cross entropy loss.
            Defaults to False.
        reduction (str, optional): Options are "none", "mean" and "sum".
            Defaults to 'mean'.
        class_weight (list[float] | str, optional): Weight of each class. If
            in str format, read them from a file. Defaults to None.
        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
    """

    def __init__(self,
                 use_sigmoid=False,
                 use_mask=False,
                 reduction='mean',
                 class_weight=None,
                 loss_weight=1.0):
        super().__init__()
        # The sigmoid and mask variants are mutually exclusive.
        assert use_sigmoid is False or use_mask is False
        self.use_sigmoid = use_sigmoid
        self.use_mask = use_mask
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.class_weight = get_class_weight(class_weight)

        criterion = cross_entropy
        if self.use_sigmoid:
            criterion = binary_cross_entropy
        elif self.use_mask:
            criterion = mask_cross_entropy
        self.cls_criterion = criterion

    def forward(self,
                cls_score,
                label,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        """Forward function."""
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction

        class_weight = None
        if self.class_weight is not None:
            # Build the weight tensor with the dtype/device of the scores.
            class_weight = cls_score.new_tensor(self.class_weight)

        raw_loss = self.cls_criterion(
            cls_score,
            label,
            weight,
            class_weight=class_weight,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return self.loss_weight * raw_loss
@weighted_loss
def dice_loss(pred,
              target,
              valid_mask,
              smooth=1,
              exponent=2,
              class_weight=None,
              ignore_index=255):
    """Average the per-class binary dice losses over all classes.

    The class whose index equals ``ignore_index`` contributes nothing, but
    the sum is still divided by the full number of classes.
    """
    assert pred.shape[0] == target.shape[0]
    num_classes = pred.shape[1]
    total_loss = 0
    for cls_idx in range(num_classes):
        if cls_idx == ignore_index:
            continue
        class_loss = binary_dice_loss(
            pred[:, cls_idx],
            target[..., cls_idx],
            valid_mask=valid_mask,
            smooth=smooth,
            exponent=exponent)
        if class_weight is not None:
            class_loss *= class_weight[cls_idx]
        total_loss += class_loss
    return total_loss / num_classes


@weighted_loss
def binary_dice_loss(pred, target, valid_mask, smooth=1, exponent=2, **kwards):
    """Per-sample dice loss for a single class.

    All inputs are flattened to (N, -1). ``valid_mask`` is applied to the
    numerator only; the denominator sums over every position.
    """
    assert pred.shape[0] == target.shape[0]
    batch = pred.shape[0]
    flat_pred = pred.reshape(batch, -1)
    flat_target = target.reshape(target.shape[0], -1)
    flat_mask = valid_mask.reshape(valid_mask.shape[0], -1)

    overlap = torch.sum(torch.mul(flat_pred, flat_target) * flat_mask, dim=1)
    numerator = overlap * 2 + smooth
    denominator = torch.sum(
        flat_pred.pow(exponent) + flat_target.pow(exponent), dim=1) + smooth

    return 1 - numerator / denominator
@LOSSES.register_module()
class DiceLoss(nn.Module):
    """DiceLoss.

    This loss is proposed in "V-Net: Fully Convolutional Neural Networks for
    Volumetric Medical Image Segmentation".

    Args:
        smooth (float): A float number to smooth loss, and avoid NaN error.
            Default: 1
        exponent (float): A float number to calculate the denominator
            value: \\sum{x^exponent} + \\sum{y^exponent}. Default: 2.
        reduction (str, optional): The method used to reduce the loss.
            Options are "none", "mean" and "sum". Default: 'mean'.
        class_weight (list[float] | str, optional): Weight of each class. If
            in str format, read them from a file. Defaults to None.
        loss_weight (float, optional): Weight of the loss. Default to 1.0.
        ignore_index (int | None): The label index to be ignored.
            Default: 255.

    Any additional keyword arguments are accepted and ignored.
    """

    def __init__(self,
                 smooth=1,
                 exponent=2,
                 reduction='mean',
                 class_weight=None,
                 loss_weight=1.0,
                 ignore_index=255,
                 **kwards):
        super().__init__()
        self.smooth = smooth
        self.exponent = exponent
        self.reduction = reduction
        self.class_weight = get_class_weight(class_weight)
        self.loss_weight = loss_weight
        self.ignore_index = ignore_index

    def forward(self,
                pred,
                target,
                avg_factor=None,
                reduction_override=None,
                **kwards):
        """Softmax the logits, one-hot the target and apply the dice loss."""
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction

        class_weight = None
        if self.class_weight is not None:
            class_weight = pred.new_tensor(self.class_weight)

        probs = F.softmax(pred, dim=1)
        num_classes = probs.shape[1]
        # Clamp so that out-of-range labels (e.g. the ignore value) can still
        # be one-hot encoded; valid_mask marks the non-ignored positions.
        clamped_target = torch.clamp(target.long(), 0, num_classes - 1)
        one_hot_target = F.one_hot(clamped_target, num_classes=num_classes)
        valid_mask = (target != self.ignore_index).long()

        raw_loss = dice_loss(
            probs,
            one_hot_target,
            valid_mask=valid_mask,
            reduction=reduction,
            avg_factor=avg_factor,
            smooth=self.smooth,
            exponent=self.exponent,
            class_weight=class_weight,
            ignore_index=self.ignore_index)
        return self.loss_weight * raw_loss
def lovasz_grad(gt_sorted):
    """Computes gradient of the Lovasz extension w.r.t sorted errors.

    See Alg. 1 in paper.

    Args:
        gt_sorted (torch.Tensor): 1-D ground-truth indicators (0 or 1),
            ordered by decreasing error.

    Returns:
        torch.Tensor: Gradient vector with the same length as ``gt_sorted``.
    """
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - intersection / union
    if p > 1:  # cover 1-pixel case
        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
    return jaccard


def flatten_binary_logits(logits, labels, ignore_index=None):
    """Flattens predictions in the batch (binary case) Remove labels equal to
    'ignore_index'."""
    logits = logits.view(-1)
    labels = labels.view(-1)
    if ignore_index is None:
        return logits, labels
    valid = (labels != ignore_index)
    return logits[valid], labels[valid]


def flatten_probs(probs, labels, ignore_index=None):
    """Flattens predictions in the batch.

    Args:
        probs (torch.Tensor): [B, C, H, W] probabilities, or [B, H, W] which
            is treated as a single-channel (sigmoid) output.
        labels (torch.Tensor): Labels with B * H * W elements.
        ignore_index (int | None): Label value to drop. Default: None.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: Probabilities of shape [P, C] and
            labels of shape [P], restricted to the non-ignored positions.
    """
    if probs.dim() == 3:
        # assumes output of a sigmoid layer
        B, H, W = probs.size()
        probs = probs.view(B, 1, H, W)
    B, C, H, W = probs.size()
    probs = probs.permute(0, 2, 3, 1).contiguous().view(-1, C)  # B*H*W, C=P,C
    labels = labels.view(-1)
    if ignore_index is None:
        return probs, labels
    valid = (labels != ignore_index)
    # Boolean-mask indexing always keeps the row dimension. Indexing with
    # ``valid.nonzero().squeeze()`` returns a 1-D [C] tensor when exactly one
    # position is valid, because the squeezed index becomes 0-dimensional.
    vprobs = probs[valid]
    vlabels = labels[valid]
    return vprobs, vlabels
def lovasz_hinge_flat(logits, labels):
    """Binary Lovasz hinge loss.

    Args:
        logits (torch.Tensor): [P], logits at each prediction
            (between -infty and +infty).
        labels (torch.Tensor): [P], binary ground truth labels (0 or 1).

    Returns:
        torch.Tensor: The calculated loss.
    """
    if len(labels) == 0:
        # only void pixels, the gradients should be 0
        return logits.sum() * 0.
    # Map labels {0, 1} to signs {-1, +1} and compute the hinge margins.
    signs = 2. * labels.float() - 1.
    margins = 1. - logits * signs
    margins_sorted, order = torch.sort(margins, dim=0, descending=True)
    order = order.data
    grad = lovasz_grad(labels[order])
    return torch.dot(F.relu(margins_sorted), grad)


def lovasz_hinge(logits,
                 labels,
                 classes='present',
                 per_image=False,
                 class_weight=None,
                 reduction='mean',
                 avg_factor=None,
                 ignore_index=255):
    """Binary Lovasz hinge loss.

    Args:
        logits (torch.Tensor): [B, H, W], logits at each pixel
            (between -infty and +infty).
        labels (torch.Tensor): [B, H, W], binary ground truth masks (0 or 1).
        classes (str | list[int], optional): Placeholder, to be consistent
            with other loss. Unused here.
        per_image (bool, optional): If per_image is True, compute the loss
            per image instead of per batch. Default: False.
        class_weight (list[float], optional): Placeholder, to be consistent
            with other loss. Unused here.
        reduction (str, optional): The method used to reduce the loss.
            Options are "none", "mean" and "sum". This parameter only works
            when per_image is True. Default: 'mean'.
        avg_factor (int, optional): Average factor that is used to average
            the loss. This parameter only works when per_image is True.
            Default: None.
        ignore_index (int | None): The label index to be ignored.
            Default: 255.

    Returns:
        torch.Tensor: The calculated loss.
    """
    if not per_image:
        return lovasz_hinge_flat(
            *flatten_binary_logits(logits, labels, ignore_index))

    image_losses = [
        lovasz_hinge_flat(*flatten_binary_logits(
            logit.unsqueeze(0), label.unsqueeze(0), ignore_index))
        for logit, label in zip(logits, labels)
    ]
    return weight_reduce_loss(
        torch.stack(image_losses), None, reduction, avg_factor)
def lovasz_softmax_flat(probs, labels, classes='present', class_weight=None):
    """Multi-class Lovasz-Softmax loss.

    Args:
        probs (torch.Tensor): [P, C], class probabilities at each prediction
            (between 0 and 1).
        labels (torch.Tensor): [P], ground truth labels (between 0 and C - 1).
        classes (str | list[int], optional): Classes chosen to calculate loss.
            'all' for all classes, 'present' for classes present in labels, or
            a list of classes to average. Default: 'present'.
        class_weight (list[float], optional): The weight for each class.
            Default: None.

    Returns:
        torch.Tensor: The calculated loss. A zero that stays connected to
            ``probs`` is returned when there is nothing to average.

    Raises:
        ValueError: If ``probs`` has a single channel but more than one
            class is selected.
    """
    if probs.numel() == 0:
        # only void pixels, the gradients should be 0
        return probs * 0.
    C = probs.size(1)
    losses = []
    class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
    for c in class_to_sum:
        fg = (labels == c).float()  # foreground for class c
        if (classes == 'present' and fg.sum() == 0):
            continue
        if C == 1:
            # Count the selected classes, not the characters of the
            # 'all'/'present' keyword, which always has length > 1.
            if len(class_to_sum) > 1:
                raise ValueError('Sigmoid output possible only with 1 class')
            class_pred = probs[:, 0]
        else:
            class_pred = probs[:, c]
        errors = (fg - class_pred).abs()
        errors_sorted, perm = torch.sort(errors, 0, descending=True)
        perm = perm.data
        fg_sorted = fg[perm]
        loss = torch.dot(errors_sorted, lovasz_grad(fg_sorted))
        if class_weight is not None:
            loss *= class_weight[c]
        losses.append(loss)
    if not losses:
        # No selected class occurs in ``labels``; torch.stack would raise on
        # an empty list, so return a zero loss instead.
        return probs.sum() * 0.
    return torch.stack(losses).mean()
def lovasz_softmax(probs,
                   labels,
                   classes='present',
                   per_image=False,
                   class_weight=None,
                   reduction='mean',
                   avg_factor=None,
                   ignore_index=255):
    """Multi-class Lovasz-Softmax loss.

    Args:
        probs (torch.Tensor): [B, C, H, W], class probabilities at each
            prediction (between 0 and 1).
        labels (torch.Tensor): [B, H, W], ground truth labels (between 0 and
            C - 1).
        classes (str | list[int], optional): Classes chosen to calculate loss.
            'all' for all classes, 'present' for classes present in labels, or
            a list of classes to average. Default: 'present'.
        per_image (bool, optional): If per_image is True, compute the loss per
            image instead of per batch. Default: False.
        class_weight (list[float], optional): The weight for each class.
            Default: None.
        reduction (str, optional): The method used to reduce the loss. Options
            are "none", "mean" and "sum". This parameter only works when
            per_image is True. Default: 'mean'.
        avg_factor (int, optional): Average factor that is used to average
            the loss. This parameter only works when per_image is True.
            Default: None.
        ignore_index (int | None): The label index to be ignored. Default: 255.

    Returns:
        torch.Tensor: The calculated loss.
    """
    if not per_image:
        flat_probs, flat_labels = flatten_probs(probs, labels, ignore_index)
        return lovasz_softmax_flat(
            flat_probs,
            flat_labels,
            classes=classes,
            class_weight=class_weight)

    # One loss per image, each on a singleton batch.
    image_losses = [
        lovasz_softmax_flat(
            *flatten_probs(
                prob.unsqueeze(0), label.unsqueeze(0), ignore_index),
            classes=classes,
            class_weight=class_weight)
        for prob, label in zip(probs, labels)
    ]
    return weight_reduce_loss(
        torch.stack(image_losses), None, reduction, avg_factor)
@LOSSES.register_module()
class LovaszLoss(nn.Module):
    """LovaszLoss.

    This loss is proposed in "The Lovasz-Softmax loss: A tractable surrogate
    for the optimization of the intersection-over-union measure in neural
    networks".

    Args:
        loss_type (str, optional): Binary or multi-class loss.
            Default: 'multi_class'. Options are "binary" and "multi_class".
        classes (str | list[int], optional): Classes chosen to calculate loss.
            'all' for all classes, 'present' for classes present in labels, or
            a list of classes to average. Default: 'present'.
        per_image (bool, optional): If per_image is True, compute the loss per
            image instead of per batch. Default: False.
        reduction (str, optional): The method used to reduce the loss. Options
            are "none", "mean" and "sum". This parameter only works when
            per_image is True; it must be "none" when per_image is False.
            Default: 'mean'.
        class_weight (list[float] | str, optional): Weight of each class. If in
            str format, read them from a file. Defaults to None.
        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
    """

    def __init__(self,
                 loss_type='multi_class',
                 classes='present',
                 per_image=False,
                 reduction='mean',
                 class_weight=None,
                 loss_weight=1.0):
        super(LovaszLoss, self).__init__()
        # Messages use adjacent string literals: a backslash continuation
        # inside a literal embeds the next line's indentation in the text.
        assert loss_type in ('binary', 'multi_class'), (
            "loss_type should be "
            "'binary' or 'multi_class'.")

        if loss_type == 'binary':
            self.cls_criterion = lovasz_hinge
        else:
            self.cls_criterion = lovasz_softmax
        assert classes in ('all', 'present') or mmcv.is_list_of(classes, int)
        if not per_image:
            assert reduction == 'none', (
                "reduction should be 'none' when "
                "per_image is False.")

        self.classes = classes
        self.per_image = per_image
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.class_weight = get_class_weight(class_weight)

    def forward(self,
                cls_score,
                label,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        """Forward function.

        ``weight`` is accepted for interface parity with the other losses
        and is not used.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if self.class_weight is not None:
            class_weight = cls_score.new_tensor(self.class_weight)
        else:
            class_weight = None

        # if multi-class loss, transform logits to probs
        if self.cls_criterion == lovasz_softmax:
            cls_score = F.softmax(cls_score, dim=1)

        loss_cls = self.loss_weight * self.cls_criterion(
            cls_score,
            label,
            self.classes,
            self.per_image,
            class_weight=class_weight,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss_cls
def get_class_weight(class_weight):
    """Get class weight for loss function.

    Args:
        class_weight (list[float] | str | None): If class_weight is a str,
            take it as a file name and read from it; any other value is
            returned unchanged.
    """
    if not isinstance(class_weight, str):
        return class_weight
    # A string is a file path: '.npy' goes through numpy, anything else
    # (pkl, json or yaml) through mmcv.
    if class_weight.endswith('.npy'):
        return np.load(class_weight)
    return mmcv.load(class_weight)


def reduce_loss(loss, reduction):
    """Reduce loss as specified.

    Args:
        loss (Tensor): Elementwise loss tensor.
        reduction (str): Options are "none", "mean" and "sum".

    Return:
        Tensor: Reduced loss tensor.
    """
    # none: 0, elementwise_mean:1, sum: 2
    mode = F._Reduction.get_enum(reduction)
    if mode == 0:
        return loss
    if mode == 1:
        return loss.mean()
    if mode == 2:
        return loss.sum()


def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
    """Apply element-wise weight and reduce loss.

    Args:
        loss (Tensor): Element-wise loss.
        weight (Tensor): Element-wise weights.
        reduction (str): Same as built-in losses of PyTorch.
        avg_factor (float): Average factor when computing the mean of losses.

    Returns:
        Tensor: Processed loss values.

    Raises:
        ValueError: If ``avg_factor`` is given together with a reduction
            other than "mean" or "none".
    """
    # if weight is specified, apply element-wise weight
    if weight is not None:
        assert weight.dim() == loss.dim()
        if weight.dim() > 1:
            assert weight.size(1) in (1, loss.size(1))
        loss = loss * weight

    # without an avg_factor, fall back to the plain reduction
    if avg_factor is None:
        return reduce_loss(loss, reduction)

    # with an avg_factor, "mean" divides the sum by it and "none" is a no-op
    if reduction == 'mean':
        return loss.sum() / avg_factor
    if reduction != 'none':
        raise ValueError('avg_factor can not be used with reduction="sum"')
    return loss
+ >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py b/lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b9d3d5b3fe80247642d962edd6fb787537d01d6 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py @@ -0,0 +1,4 @@ +from .fpn import FPN +from .multilevel_neck import MultiLevelNeck + +__all__ = ['FPN', 'MultiLevelNeck'] diff --git a/lavis/common/annotator/uniformer/mmseg/models/necks/fpn.py b/lavis/common/annotator/uniformer/mmseg/models/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..a53b2a69500f8c2edb835abc3ff0ccc2173d1fb1 --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/necks/fpn.py @@ -0,0 +1,212 @@ +import torch.nn as nn +import torch.nn.functional as F +from annotator.uniformer.mmcv.cnn import ConvModule, xavier_init + +from ..builder import NECKS + + +@NECKS.register_module() +class FPN(nn.Module): + """Feature Pyramid Network. + + This is an implementation of - Feature Pyramid Networks for Object + Detection (https://arxiv.org/abs/1612.03144) + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. 
@NECKS.register_module()
class FPN(nn.Module):
    """Feature Pyramid Network.

    This is an implementation of - Feature Pyramid Networks for Object
    Detection (https://arxiv.org/abs/1612.03144)

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale)
        num_outs (int): Number of output scales.
        start_level (int): Index of the start input backbone level used to
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
        add_extra_convs (bool | str): If bool, it decides whether to add conv
            layers on top of the original feature maps. Default to False.
            If True, its actual mode is specified by `extra_convs_on_inputs`.
            If str, it specifies the source feature map of the extra convs.
            Only the following options are allowed

            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
            - 'on_lateral':  Last feature map after lateral convs.
            - 'on_output': The last output feature map after fpn convs.
        extra_convs_on_inputs (bool, deprecated): Only consulted when
            `add_extra_convs` is the bool True. If True, it is equivalent to
            `add_extra_convs='on_input'`. If False, it is equivalent to
            `add_extra_convs='on_output'`. Default: False.
        relu_before_extra_convs (bool): Whether to apply relu before the extra
            conv. Default: False.
        no_norm_on_lateral (bool): If True, the lateral convs are built
            without a norm layer. Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): Config dict for activation layer in ConvModule.
            Default: None.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(mode='nearest')`

    Example:
        >>> import torch
        >>> in_channels = [2, 3, 5, 7]
        >>> scales = [340, 170, 84, 43]
        >>> inputs = [torch.rand(1, c, s, s)
        ...           for c, s in zip(in_channels, scales)]
        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
        >>> outputs = self.forward(inputs)
        >>> for i in range(len(outputs)):
        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
        outputs[0].shape = torch.Size([1, 11, 340, 340])
        outputs[1].shape = torch.Size([1, 11, 170, 170])
        outputs[2].shape = torch.Size([1, 11, 84, 84])
        outputs[3].shape = torch.Size([1, 11, 43, 43])
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 add_extra_convs=False,
                 extra_convs_on_inputs=False,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 upsample_cfg=dict(mode='nearest')):
        super(FPN, self).__init__()
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.num_outs = num_outs
        self.relu_before_extra_convs = relu_before_extra_convs
        self.no_norm_on_lateral = no_norm_on_lateral
        self.fp16_enabled = False
        # copied so the shared default dict in the signature is never aliased
        self.upsample_cfg = upsample_cfg.copy()

        if end_level == -1:
            # use every backbone level from start_level onwards
            self.backbone_end_level = self.num_ins
            assert num_outs >= self.num_ins - start_level
        else:
            # if end_level < inputs, no extra level is allowed
            self.backbone_end_level = end_level
            assert end_level <= len(in_channels)
            assert num_outs == end_level - start_level
        self.start_level = start_level
        self.end_level = end_level
        self.add_extra_convs = add_extra_convs
        assert isinstance(add_extra_convs, (str, bool))
        if isinstance(add_extra_convs, str):
            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
        elif add_extra_convs:  # True
            # normalize the bool form to one of the string modes
            if extra_convs_on_inputs:
                # For compatibility with previous release
                # TODO: deprecate `extra_convs_on_inputs`
                self.add_extra_convs = 'on_input'
            else:
                self.add_extra_convs = 'on_output'

        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()

        # one 1x1 lateral conv and one 3x3 output conv per used backbone level
        for i in range(self.start_level, self.backbone_end_level):
            l_conv = ConvModule(
                in_channels[i],
                out_channels,
                1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
                act_cfg=act_cfg,
                inplace=False)
            fpn_conv = ConvModule(
                out_channels,
                out_channels,
                3,
                padding=1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                inplace=False)

            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        # add extra conv layers (e.g., RetinaNet)
        extra_levels = num_outs - self.backbone_end_level + self.start_level
        if self.add_extra_convs and extra_levels >= 1:
            for i in range(extra_levels):
                # NOTE: from here on the `in_channels` argument is rebound to
                # a single int; the original list lives on in self.in_channels
                if i == 0 and self.add_extra_convs == 'on_input':
                    in_channels = self.in_channels[self.backbone_end_level - 1]
                else:
                    in_channels = out_channels
                extra_fpn_conv = ConvModule(
                    in_channels,
                    out_channels,
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                # extra convs are appended after the per-level convs, so they
                # sit at indices >= number of used backbone levels
                self.fpn_convs.append(extra_fpn_conv)

    def init_weights(self):
        """Apply ``xavier_init(distribution='uniform')`` to every
        ``nn.Conv2d`` submodule."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform')

    def forward(self, inputs):
        """Build the feature pyramid.

        Args:
            inputs (Sequence[Tensor]): One feature map per entry of
                ``self.in_channels``.

        Returns:
            tuple[Tensor]: The pyramid outputs, one per output level.
        """
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # build top-down path
        # (walks from the last level to the first; each level is upsampled
        # and added in place to the level before it)
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
            #  it cannot co-exist with `size` in `F.interpolate`.
            if 'scale_factor' in self.upsample_cfg:
                laterals[i - 1] += F.interpolate(laterals[i],
                                                 **self.upsample_cfg)
            else:
                prev_shape = laterals[i - 1].shape[2:]
                laterals[i - 1] += F.interpolate(
                    laterals[i], size=prev_shape, **self.upsample_cfg)

        # build outputs
        # part 1: from original levels
        outs = [
            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
        ]
        # part 2: add extra levels
        if self.num_outs > len(outs):
            # use max pool to get more levels on top of outputs
            # (e.g., Faster R-CNN, Mask R-CNN)
            if not self.add_extra_convs:
                for i in range(self.num_outs - used_backbone_levels):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            # add conv layers on top of original feature maps (RetinaNet)
            else:
                # pick the tensor the first extra conv is applied to
                if self.add_extra_convs == 'on_input':
                    extra_source = inputs[self.backbone_end_level - 1]
                elif self.add_extra_convs == 'on_lateral':
                    extra_source = laterals[-1]
                elif self.add_extra_convs == 'on_output':
                    extra_source = outs[-1]
                else:
                    raise NotImplementedError
                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
                # every further extra level is computed from the previous output
                for i in range(used_backbone_levels + 1, self.num_outs):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return tuple(outs)
@NECKS.register_module()
class MultiLevelNeck(nn.Module):
    """MultiLevelNeck.

    A neck structure that connects a ViT backbone and the decode heads: every
    input is projected by a 1x1 conv, resized by a per-output scale factor
    and refined by a 3x3 conv.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        scales (List[float]): Scale factors for each output feature map.
            Default: [0.5, 1, 2, 4].
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): Config dict for activation layer in ConvModule.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 scales=[0.5, 1, 2, 4],
                 norm_cfg=None,
                 act_cfg=None):
        super(MultiLevelNeck, self).__init__()
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Copy: the default list in the signature is shared by every call, so
        # storing it by reference would let one instance's mutation leak into
        # all others.
        self.scales = list(scales)
        self.num_outs = len(self.scales)
        self.lateral_convs = nn.ModuleList()
        self.convs = nn.ModuleList()
        # 1x1 projection per input level
        for in_channel in in_channels:
            self.lateral_convs.append(
                ConvModule(
                    in_channel,
                    out_channels,
                    kernel_size=1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
        # 3x3 refinement per output level
        for _ in range(self.num_outs):
            self.convs.append(
                ConvModule(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    padding=1,
                    stride=1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))

    def forward(self, inputs):
        """Project, rescale and refine the input feature maps.

        Args:
            inputs (Sequence[Tensor]): One feature map per entry of
                ``self.in_channels``.

        Returns:
            tuple[Tensor]: ``self.num_outs`` feature maps, the i-th one
                resized by ``self.scales[i]``.
        """
        assert len(inputs) == len(self.in_channels)
        # (a leftover debug print of inputs[0].shape used to live here; it
        # wrote to stdout on every forward pass and has been removed)
        inputs = [
            lateral_conv(inputs[i])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]
        # for len(inputs) not equal to self.num_outs:
        # a single input is reused as the source of every output level
        if len(inputs) == 1:
            inputs = [inputs[0] for _ in range(self.num_outs)]
        outs = []
        for i in range(self.num_outs):
            x_resize = F.interpolate(
                inputs[i], scale_factor=self.scales[i], mode='bilinear')
            outs.append(self.convs[i](x_resize))
        return tuple(outs)
class BaseSegmentor(nn.Module, metaclass=ABCMeta):
    """Base class for segmentors.

    Subclasses must implement :meth:`extract_feat`, :meth:`encode_decode`,
    :meth:`forward_train`, :meth:`simple_test` and :meth:`aug_test`.

    The abstract base is declared with the Python 3 ``metaclass=`` keyword.
    The former ``__metaclass__ = ABCMeta`` class attribute is ignored by
    Python 3, which left the ``@abstractmethod`` markers unenforced.
    """

    def __init__(self):
        super(BaseSegmentor, self).__init__()
        self.fp16_enabled = False

    @property
    def with_neck(self):
        """bool: whether the segmentor has neck"""
        return hasattr(self, 'neck') and self.neck is not None

    @property
    def with_auxiliary_head(self):
        """bool: whether the segmentor has auxiliary head"""
        return hasattr(self,
                       'auxiliary_head') and self.auxiliary_head is not None

    @property
    def with_decode_head(self):
        """bool: whether the segmentor has decode head"""
        return hasattr(self, 'decode_head') and self.decode_head is not None

    @abstractmethod
    def extract_feat(self, imgs):
        """Placeholder for extract features from images."""
        pass

    @abstractmethod
    def encode_decode(self, img, img_metas):
        """Placeholder for encode images with backbone and decode into a
        semantic segmentation map of the same size as input."""
        pass

    @abstractmethod
    def forward_train(self, imgs, img_metas, **kwargs):
        """Placeholder for Forward function for training."""
        pass

    @abstractmethod
    def simple_test(self, img, img_meta, **kwargs):
        """Placeholder for single image test."""
        pass

    @abstractmethod
    def aug_test(self, imgs, img_metas, **kwargs):
        """Placeholder for augmentation test."""
        pass

    def init_weights(self, pretrained=None):
        """Initialize the weights in segmentor.

        The base implementation only logs the checkpoint path; it loads
        nothing itself.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        if pretrained is not None:
            logger = logging.getLogger()
            logger.info(f'load model from: {pretrained}')

    def forward_test(self, imgs, img_metas, **kwargs):
        """Validate test-time inputs and dispatch to the right test method.

        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (List[List[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch.

        Returns:
            The result of :meth:`simple_test` when there is exactly one
            augmentation, otherwise the result of :meth:`aug_test`.

        Raises:
            TypeError: If ``imgs`` or ``img_metas`` is not a list.
            ValueError: If the two lists differ in length.
        """
        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got '
                                f'{type(var)}')

        num_augs = len(imgs)
        if num_augs != len(img_metas):
            raise ValueError(f'num of augmentations ({len(imgs)}) != '
                             f'num of image meta ({len(img_metas)})')
        # all images in the same aug batch must share the same ori_shape,
        # img_shape and pad_shape
        for img_meta in img_metas:
            ori_shapes = [_['ori_shape'] for _ in img_meta]
            assert all(shape == ori_shapes[0] for shape in ori_shapes)
            img_shapes = [_['img_shape'] for _ in img_meta]
            assert all(shape == img_shapes[0] for shape in img_shapes)
            pad_shapes = [_['pad_shape'] for _ in img_meta]
            assert all(shape == pad_shapes[0] for shape in pad_shapes)

        if num_augs == 1:
            return self.simple_test(imgs[0], img_metas[0], **kwargs)
        else:
            return self.aug_test(imgs, img_metas, **kwargs)

    @auto_fp16(apply_to=('img', ))
    def forward(self, img, img_metas, return_loss=True, **kwargs):
        """Calls either :func:`forward_train` or :func:`forward_test` depending
        on whether ``return_loss`` is ``True``.

        Note this setting will change the expected inputs. When
        ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
        and List[dict]), and when ``return_loss=False``, img and img_meta
        should be double nested (i.e.  List[Tensor], List[List[dict]]), with
        the outer list indicating test time augmentations.
        """
        if return_loss:
            return self.forward_train(img, img_metas, **kwargs)
        else:
            return self.forward_test(img, img_metas, **kwargs)

    def train_step(self, data_batch, optimizer, **kwargs):
        """The iteration step during training.

        This method defines an iteration step during training, except for the
        back propagation and optimizer updating, which are done in an optimizer
        hook. Note that in some complicated cases or models, the whole process
        including back propagation and optimizer updating is also defined in
        this method, such as GAN.

        Args:
            data_batch (dict): The output of dataloader. It is unpacked as
                keyword arguments into :meth:`forward` and must contain the
                key ``img_metas``.
            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
                runner is passed to ``train_step()``. This argument is unused
                and reserved.

        Returns:
            dict: It contains 3 keys: ``loss``, ``log_vars``,
                ``num_samples``.
                ``loss`` is a tensor for back propagation, which can be a
                weighted sum of multiple losses.
                ``log_vars`` contains all the variables to be sent to the
                logger.
                ``num_samples`` is ``len(data_batch['img_metas'])``, used for
                averaging the logs.
        """
        losses = self(**data_batch)
        loss, log_vars = self._parse_losses(losses)

        outputs = dict(
            loss=loss,
            log_vars=log_vars,
            num_samples=len(data_batch['img_metas']))

        return outputs

    def val_step(self, data_batch, **kwargs):
        """The iteration step during validation.

        Forwards ``data_batch`` and ``kwargs`` to :meth:`forward` and returns
        its output unchanged. Note that the evaluation after training epochs
        is not implemented with this method, but an evaluation hook.
        """
        output = self(**data_batch, **kwargs)
        return output

    @staticmethod
    def _parse_losses(losses):
        """Parse the raw outputs (losses) of the network.

        Args:
            losses (dict): Raw output of the network, which usually contain
                losses and other necessary information. Every value must be
                a Tensor or a list of Tensors.

        Returns:
            tuple[Tensor, dict]: (loss, log_vars), loss is the sum of every
                entry whose key contains ``'loss'``, log_vars maps each name
                (plus ``'loss'``) to a Python float for the logger.

        Raises:
            TypeError: If a value is neither a Tensor nor a list.
        """
        log_vars = OrderedDict()
        for loss_name, loss_value in losses.items():
            if isinstance(loss_value, torch.Tensor):
                log_vars[loss_name] = loss_value.mean()
            elif isinstance(loss_value, list):
                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
            else:
                raise TypeError(
                    f'{loss_name} is not a tensor or list of tensors')

        # only entries named like a loss contribute to the optimized total
        loss = sum(_value for _key, _value in log_vars.items()
                   if 'loss' in _key)

        log_vars['loss'] = loss
        for loss_name, loss_value in log_vars.items():
            # reduce loss when distributed training; the clone keeps the
            # tensor used for back propagation untouched
            if dist.is_available() and dist.is_initialized():
                loss_value = loss_value.data.clone()
                dist.all_reduce(loss_value.div_(dist.get_world_size()))
            log_vars[loss_name] = loss_value.item()

        return loss, log_vars

    def show_result(self,
                    img,
                    result,
                    palette=None,
                    win_name='',
                    show=False,
                    wait_time=0,
                    out_file=None,
                    opacity=0.5):
        """Draw `result` over `img`.

        Reads ``self.PALETTE`` and ``self.CLASSES``, which this base class
        does not define; they must be set on the instance or subclass.

        Args:
            img (str or Tensor): The image to be displayed.
            result (Tensor): The semantic segmentation results to draw over
                `img`. Only ``result[0]`` is drawn.
            palette (list[list[int]]] | np.ndarray | None): The palette of
                segmentation map. If None is given, ``self.PALETTE`` is used,
                or a random palette when that is None too. Default: None
            win_name (str): The window name.
            wait_time (int): Value of waitKey param.
                Default: 0.
            show (bool): Whether to show the image. Ignored when ``out_file``
                is given. Default: False.
            out_file (str or None): The filename to write the image.
                Default: None.
            opacity(float): Opacity of painted segmentation map.
                Default 0.5.
                Must be in (0, 1] range.
        Returns:
            img (np.ndarray): The blended image, returned only when neither
                `show` nor `out_file` is set; otherwise None.
        """
        img = mmcv.imread(img)
        img = img.copy()
        seg = result[0]
        if palette is None:
            if self.PALETTE is None:
                palette = np.random.randint(
                    0, 255, size=(len(self.CLASSES), 3))
            else:
                palette = self.PALETTE
        palette = np.array(palette)
        assert palette.shape[0] == len(self.CLASSES)
        assert palette.shape[1] == 3
        assert len(palette.shape) == 2
        assert 0 < opacity <= 1.0
        color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
        for label, color in enumerate(palette):
            color_seg[seg == label, :] = color
        # convert to BGR
        color_seg = color_seg[..., ::-1]

        img = img * (1 - opacity) + color_seg * opacity
        img = img.astype(np.uint8)
        # if out_file specified, do not show image in window
        if out_file is not None:
            show = False

        if show:
            mmcv.imshow(img, win_name, wait_time)
        if out_file is not None:
            mmcv.imwrite(img, out_file)

        if not (show or out_file):
            warnings.warn('show==False and out_file is not specified, only '
                          'result image will be returned')
            return img
@SEGMENTORS.register_module()
class CascadeEncoderDecoder(EncoderDecoder):
    """Cascade Encoder Decoder segmentors.

    CascadeEncoderDecoder almost the same as EncoderDecoder, while decoders of
    CascadeEncoderDecoder are cascaded. The output of previous decoder_head
    will be the input of next decoder_head.

    Args:
        num_stages (int): Number of cascaded decode heads.
        decode_head (list): One head config per stage; its length must equal
            ``num_stages``.

    The remaining arguments are forwarded unchanged to ``EncoderDecoder``.
    """

    def __init__(self,
                 num_stages,
                 backbone,
                 decode_head,
                 neck=None,
                 auxiliary_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        # must be set before the parent constructor runs, because that
        # constructor calls _init_decode_head(), which reads num_stages
        self.num_stages = num_stages
        super(CascadeEncoderDecoder, self).__init__(
            backbone=backbone,
            decode_head=decode_head,
            neck=neck,
            auxiliary_head=auxiliary_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    def _init_decode_head(self, decode_head):
        """Initialize ``decode_head``"""
        assert isinstance(decode_head, list)
        assert len(decode_head) == self.num_stages
        self.decode_head = nn.ModuleList()
        for i in range(self.num_stages):
            self.decode_head.append(builder.build_head(decode_head[i]))
        # the final stage defines the segmentor-level output properties
        self.align_corners = self.decode_head[-1].align_corners
        self.num_classes = self.decode_head[-1].num_classes

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone and heads.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        self.backbone.init_weights(pretrained=pretrained)
        for i in range(self.num_stages):
            self.decode_head[i].init_weights()
        if self.with_auxiliary_head:
            if isinstance(self.auxiliary_head, nn.ModuleList):
                for aux_head in self.auxiliary_head:
                    aux_head.init_weights()
            else:
                self.auxiliary_head.init_weights()

    def encode_decode(self, img, img_metas):
        """Encode images with backbone and decode into a semantic segmentation
        map of the same size as input."""
        x = self.extract_feat(img)
        # stage 0 sees only the features; later stages also get the previous
        # stage's output
        out = self.decode_head[0].forward_test(x, img_metas, self.test_cfg)
        for i in range(1, self.num_stages):
            out = self.decode_head[i].forward_test(x, out, img_metas,
                                                   self.test_cfg)
        out = resize(
            input=out,
            size=img.shape[2:],
            mode='bilinear',
            align_corners=self.align_corners)
        return out

    def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg):
        """Run forward function and calculate loss for decode head in
        training.

        Returns:
            dict: Losses of every stage, prefixed ``decode_<stage>``.
        """
        losses = dict()

        loss_decode = self.decode_head[0].forward_train(
            x, img_metas, gt_semantic_seg, self.train_cfg)

        losses.update(add_prefix(loss_decode, 'decode_0'))

        prev_outputs = None
        for i in range(1, self.num_stages):
            # forward test again, maybe unnecessary for most methods.
            if i == 1:
                prev_outputs = self.decode_head[0].forward_test(
                    x, img_metas, self.test_cfg)
            else:
                # Heads after the first are cascaded heads: they take the
                # previous stage's output as an extra argument, exactly as in
                # encode_decode(). Calling them without it (as was done for
                # every stage before) breaks for num_stages > 2.
                prev_outputs = self.decode_head[i - 1].forward_test(
                    x, prev_outputs, img_metas, self.test_cfg)
            loss_decode = self.decode_head[i].forward_train(
                x, prev_outputs, img_metas, gt_semantic_seg, self.train_cfg)
            losses.update(add_prefix(loss_decode, f'decode_{i}'))

        return losses
+ prev_outputs = self.decode_head[i - 1].forward_test( + x, img_metas, self.test_cfg) + loss_decode = self.decode_head[i].forward_train( + x, prev_outputs, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_decode, f'decode_{i}')) + + return losses diff --git a/lavis/common/annotator/uniformer/mmseg/models/segmentors/encoder_decoder.py b/lavis/common/annotator/uniformer/mmseg/models/segmentors/encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..98392ac04c4c44a7f4e7b1c0808266875877dd1f --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/segmentors/encoder_decoder.py @@ -0,0 +1,298 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from annotator.uniformer.mmseg.core import add_prefix +from annotator.uniformer.mmseg.ops import resize +from .. import builder +from ..builder import SEGMENTORS +from .base import BaseSegmentor + + +@SEGMENTORS.register_module() +class EncoderDecoder(BaseSegmentor): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which could be dumped during inference. 
+ """ + + def __init__(self, + backbone, + decode_head, + neck=None, + auxiliary_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(EncoderDecoder, self).__init__() + self.backbone = builder.build_backbone(backbone) + if neck is not None: + self.neck = builder.build_neck(neck) + self._init_decode_head(decode_head) + self._init_auxiliary_head(auxiliary_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.init_weights(pretrained=pretrained) + + assert self.with_decode_head + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + self.decode_head = builder.build_head(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + + def _init_auxiliary_head(self, auxiliary_head): + """Initialize ``auxiliary_head``""" + if auxiliary_head is not None: + if isinstance(auxiliary_head, list): + self.auxiliary_head = nn.ModuleList() + for head_cfg in auxiliary_head: + self.auxiliary_head.append(builder.build_head(head_cfg)) + else: + self.auxiliary_head = builder.build_head(auxiliary_head) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone and heads. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + super(EncoderDecoder, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + self.decode_head.init_weights() + if self.with_auxiliary_head: + if isinstance(self.auxiliary_head, nn.ModuleList): + for aux_head in self.auxiliary_head: + aux_head.init_weights() + else: + self.auxiliary_head.init_weights() + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, img, img_metas): + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(img) + out = self._decode_head_forward_test(x, img_metas) + out = resize( + input=out, + size=img.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(x, img_metas, + gt_semantic_seg, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) + return seg_logits + + def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.forward_train(x, img_metas, + gt_semantic_seg, + self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.forward_train( + x, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return 
losses + + def forward_dummy(self, img): + """Dummy forward function.""" + seg_logit = self.encode_decode(img, None) + + return seg_logit + + def forward_train(self, img, img_metas, gt_semantic_seg): + """Forward function for training. + + Args: + img (Tensor): Input images. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(img) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, img_metas, + gt_semantic_seg) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train( + x, img_metas, gt_semantic_seg) + losses.update(loss_aux) + + return losses + + # TODO refactor + def slide_inference(self, img, img_meta, rescale): + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. 
+ """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = img.size() + num_classes = self.num_classes + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) + count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = img[:, :, y1:y2, x1:x2] + crop_seg_logit = self.encode_decode(crop_img, img_meta) + preds += F.pad(crop_seg_logit, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + if torch.onnx.is_in_onnx_export(): + # cast count_mat to constant while exporting to ONNX + count_mat = torch.from_numpy( + count_mat.cpu().detach().numpy()).to(device=img.device) + preds = preds / count_mat + if rescale: + preds = resize( + preds, + size=img_meta[0]['ori_shape'][:2], + mode='bilinear', + align_corners=self.align_corners, + warning=False) + return preds + + def whole_inference(self, img, img_meta, rescale): + """Inference with full image.""" + + seg_logit = self.encode_decode(img, img_meta) + if rescale: + # support dynamic shape for onnx + if torch.onnx.is_in_onnx_export(): + size = img.shape[2:] + else: + size = img_meta[0]['ori_shape'][:2] + seg_logit = resize( + seg_logit, + size=size, + mode='bilinear', + align_corners=self.align_corners, + warning=False) + + return seg_logit + + def inference(self, img, img_meta, rescale): + """Inference with slide/whole style. + + Args: + img (Tensor): The input image of shape (N, 3, H, W). 
+ img_meta (dict): Image info dict where each dict has: 'img_shape', + 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + rescale (bool): Whether rescale back to original shape. + + Returns: + Tensor: The output segmentation map. + """ + + assert self.test_cfg.mode in ['slide', 'whole'] + ori_shape = img_meta[0]['ori_shape'] + assert all(_['ori_shape'] == ori_shape for _ in img_meta) + if self.test_cfg.mode == 'slide': + seg_logit = self.slide_inference(img, img_meta, rescale) + else: + seg_logit = self.whole_inference(img, img_meta, rescale) + output = F.softmax(seg_logit, dim=1) + flip = img_meta[0]['flip'] + if flip: + flip_direction = img_meta[0]['flip_direction'] + assert flip_direction in ['horizontal', 'vertical'] + if flip_direction == 'horizontal': + output = output.flip(dims=(3, )) + elif flip_direction == 'vertical': + output = output.flip(dims=(2, )) + + return output + + def simple_test(self, img, img_meta, rescale=True): + """Simple test with single image.""" + seg_logit = self.inference(img, img_meta, rescale) + seg_pred = seg_logit.argmax(dim=1) + if torch.onnx.is_in_onnx_export(): + # our inference backend only support 4D output + seg_pred = seg_pred.unsqueeze(0) + return seg_pred + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred + + def aug_test(self, imgs, img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. 
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    In training mode each sample of the batch is either zeroed entirely or
    kept and rescaled by ``1 / keep_prob``. In eval mode, or when
    ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob (float): Drop rate for paths of model. Dropout rate has
            to be between 0 and 1. Default: 0.
    """

    def __init__(self, drop_prob=0.):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.keep_prob = 1 - drop_prob

    def forward(self, x):
        """Apply stochastic depth to ``x``.

        Args:
            x (Tensor): Input whose first dimension indexes the samples.

        Returns:
            Tensor: ``x`` itself when nothing is dropped (eval mode or
            ``drop_prob == 0``), otherwise a new tensor of the same shape.
        """
        if self.drop_prob == 0. or not self.training:
            return x

        # One random draw per sample, broadcast over all remaining dims so
        # the layer works with tensors of any rank, not just 2D ConvNets.
        mask_shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
        mask = self.keep_prob + torch.rand(
            mask_shape, dtype=x.dtype, device=x.device)
        mask.floor_()  # binarize: 1 keeps the sample, 0 drops it

        if self.keep_prob > 0.:
            return x.div(self.keep_prob) * mask
        # drop_prob == 1: every sample is dropped. Skip the rescale, because
        # dividing by keep_prob == 0 would give inf * 0 = NaN instead of 0.
        return x * mask
Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + dilation=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class InvertedResidualV3(nn.Module): + """Inverted Residual Block for MobileNetV3. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + stride (int): The stride of the depthwise convolution. Default: 1. 
+ se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super(InvertedResidualV3, self).__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=dict( + type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + 
def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
    """Round a channel count to a multiple of ``divisor``.

    The value is rounded to the nearest multiple of ``divisor`` (never below
    ``min_value``). If that rounding shrinks the value to less than
    ``min_ratio`` times the original, one more ``divisor`` is added.

    Args:
        value (int): The original channel number.
        divisor (int): The divisor the result must be a multiple of.
        min_value (int): Lower bound of the result. Default: None, meaning
            the lower bound equals ``divisor``.
        min_ratio (float): Minimum allowed ratio of the rounded channel
            number to the original channel number. Default: 0.9.

    Returns:
        int: The adjusted channel number.
    """
    lower_bound = divisor if min_value is None else min_value

    # round-half-up to a multiple of ``divisor``, then apply the lower bound
    nearest_multiple = int(value + divisor / 2) // divisor * divisor
    result = max(lower_bound, nearest_multiple)

    # Rounding down may not lose more than (1 - min_ratio) of the original.
    if result >= min_ratio * value:
        return result
    return result + divisor
Default: None + contract_dilation (bool): Whether contract first dilation of each layer + Default: False + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + dilation=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + multi_grid=None, + contract_dilation=False, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if multi_grid is None: + if dilation > 1 and contract_dilation: + first_dilation = dilation // 2 + else: + first_dilation = dilation + else: + first_dilation = multi_grid[0] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + dilation=first_dilation, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + dilation=dilation if multi_grid is None else multi_grid[i], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super(ResLayer, self).__init__(*layers) diff --git a/lavis/common/annotator/uniformer/mmseg/models/utils/se_layer.py b/lavis/common/annotator/uniformer/mmseg/models/utils/se_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..083bd7d1ccee909c900c7aed2cc928bf14727f3e --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/utils/se_layer.py @@ -0,0 +1,57 @@ +import annotator.uniformer.mmcv as mmcv +import torch.nn as nn +from 
class SELayer(nn.Module):
    """Squeeze-and-Excitation Module.

    The input is globally average-pooled, passed through two 1x1
    ``ConvModule`` layers, and the result is multiplied back onto the input.

    Args:
        channels (int): The input (and output) channels of the SE layer.
        ratio (int): Squeeze ratio in SELayer. The intermediate channel
            count is ``make_divisible(channels // ratio, 8)``. Default: 16.
        conv_cfg (None or dict): Config dict for convolution layer.
            Default: None, which means using conv2d.
        act_cfg (dict or Sequence[dict]): Config dict for activation layer.
            If act_cfg is a dict, two activation layers will be configured
            by this dict. If act_cfg is a sequence of dicts, the first
            activation layer will be configured by the first dict and the
            second activation layer will be configured by the second dict.
            Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0,
            divisor=6.0)).
    """

    def __init__(self,
                 channels,
                 ratio=16,
                 conv_cfg=None,
                 act_cfg=(dict(type='ReLU'),
                          dict(type='HSigmoid', bias=3.0, divisor=6.0))):
        super().__init__()
        # a single dict configures both activations
        if isinstance(act_cfg, dict):
            act_cfg = (act_cfg, act_cfg)
        assert len(act_cfg) == 2
        assert mmcv.is_tuple_of(act_cfg, dict)
        squeeze_act_cfg, excite_act_cfg = act_cfg

        # width of the bottleneck between the two 1x1 convolutions
        mid_channels = make_divisible(channels // ratio, 8)

        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = ConvModule(
            in_channels=channels,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            conv_cfg=conv_cfg,
            act_cfg=squeeze_act_cfg)
        self.conv2 = ConvModule(
            in_channels=mid_channels,
            out_channels=channels,
            kernel_size=1,
            stride=1,
            conv_cfg=conv_cfg,
            act_cfg=excite_act_cfg)

    def forward(self, x):
        """Scale ``x`` by the gate computed from its pooled features."""
        gate = self.conv2(self.conv1(self.global_avgpool(x)))
        return x * gate
class SelfAttentionBlock(nn.Module):
    """General self-attention block/non-local block.

    Please refer to https://arxiv.org/abs/1706.03762 for details about key,
    query and value.

    Args:
        key_in_channels (int): Input channels of key feature.
        query_in_channels (int): Input channels of query feature.
        channels (int): Output channels of key/query transform.
        out_channels (int): Output channels.
        share_key_query (bool): Whether share projection weight between key
            and query projection. Requires
            ``key_in_channels == query_in_channels``.
        query_downsample (nn.Module | None): Query downsample module.
        key_downsample (nn.Module | None): Key downsample module; it is
            applied to both the key and the value.
        key_query_num_convs (int): Number of convs for key/query projection.
        value_out_num_convs (int): Number of convs for value/out projection.
        key_query_norm (bool): Whether the key/query projections are built
            from ``ConvModule`` (receiving ``norm_cfg``/``act_cfg``) instead
            of plain ``nn.Conv2d``.
        value_out_norm (bool): Same choice for the value/out projections.
        matmul_norm (bool): Whether normalize attention map with sqrt of
            channels.
        with_out (bool): Whether use out projection.
        conv_cfg (dict|None): Config of conv layers.
        norm_cfg (dict|None): Config of norm layers.
        act_cfg (dict|None): Config of activation layers.
    """

    def __init__(self, key_in_channels, query_in_channels, channels,
                 out_channels, share_key_query, query_downsample,
                 key_downsample, key_query_num_convs, value_out_num_convs,
                 key_query_norm, value_out_norm, matmul_norm, with_out,
                 conv_cfg, norm_cfg, act_cfg):
        super().__init__()
        if share_key_query:
            assert key_in_channels == query_in_channels

        self.key_in_channels = key_in_channels
        self.query_in_channels = query_in_channels
        self.out_channels = out_channels
        self.channels = channels
        self.share_key_query = share_key_query
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        # layer configs shared by every projection
        layer_cfgs = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        key_query_kwargs = dict(
            num_convs=key_query_num_convs,
            use_conv_module=key_query_norm,
            **layer_cfgs)
        value_out_kwargs = dict(
            num_convs=value_out_num_convs,
            use_conv_module=value_out_norm,
            **layer_cfgs)

        # Projections are created in the order key, query, value, out.
        self.key_project = self.build_project(
            key_in_channels, channels, **key_query_kwargs)
        if share_key_query:
            self.query_project = self.key_project
        else:
            self.query_project = self.build_project(
                query_in_channels, channels, **key_query_kwargs)

        # without an out projection the value projection emits the final
        # channel count directly
        value_channels = channels if with_out else out_channels
        self.value_project = self.build_project(
            key_in_channels, value_channels, **value_out_kwargs)
        if with_out:
            self.out_project = self.build_project(
                channels, out_channels, **value_out_kwargs)
        else:
            self.out_project = None

        self.query_downsample = query_downsample
        self.key_downsample = key_downsample
        self.matmul_norm = matmul_norm

        self.init_weights()

    def init_weights(self):
        """Initialize weight of later layer."""
        # Only an out projection that is not a single ConvModule is
        # constant-initialised.
        if self.out_project is None or isinstance(self.out_project,
                                                  ConvModule):
            return
        constant_init(self.out_project, 0)

    def build_project(self, in_channels, channels, num_convs, use_conv_module,
                      conv_cfg, norm_cfg, act_cfg):
        """Build projection layer for key/query/value/out.

        Returns a single 1x1 layer, or an ``nn.Sequential`` of 1x1 layers
        when more than one is built.
        """
        if use_conv_module:

            def make_layer(layer_in_channels):
                return ConvModule(
                    layer_in_channels,
                    channels,
                    1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg)
        else:

            def make_layer(layer_in_channels):
                return nn.Conv2d(layer_in_channels, channels, 1)

        # the first layer changes the channel count, the rest keep it
        layers = [make_layer(in_channels)]
        for _ in range(num_convs - 1):
            layers.append(make_layer(channels))

        if len(layers) > 1:
            return nn.Sequential(*layers)
        return layers[0]

    def forward(self, query_feats, key_feats):
        """Forward function."""
        batch_size = query_feats.size(0)

        # query: project, optionally downsample, flatten trailing dims and
        # move the channel dim last
        query = self.query_project(query_feats)
        if self.query_downsample is not None:
            query = self.query_downsample(query)
        query = query.reshape(*query.shape[:2], -1)
        query = query.transpose(1, 2).contiguous()

        # key and value are both derived from ``key_feats``
        key = self.key_project(key_feats)
        value = self.value_project(key_feats)
        if self.key_downsample is not None:
            key = self.key_downsample(key)
            value = self.key_downsample(value)
        key = key.reshape(*key.shape[:2], -1)
        value = value.reshape(*value.shape[:2], -1)
        value = value.transpose(1, 2).contiguous()

        # attention weights, soft-maxed over the key positions
        attention = torch.matmul(query, key)
        if self.matmul_norm:
            attention = (self.channels**-.5) * attention
        attention = F.softmax(attention, dim=-1)

        # weighted sum of values, reshaped using the trailing dims of
        # ``query_feats``
        context = torch.matmul(attention, value)
        context = context.transpose(1, 2).contiguous()
        context = context.reshape(batch_size, -1, *query_feats.shape[2:])
        if self.out_project is not None:
            context = self.out_project(context)
        return context
b/lavis/common/annotator/uniformer/mmseg/models/utils/up_conv_block.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from annotator.uniformer.mmcv.cnn import ConvModule, build_upsample_layer + + +class UpConvBlock(nn.Module): + """Upsample convolution block in decoder for UNet. + + This upsample convolution block consists of one upsample module + followed by one convolution block. The upsample module expands the + high-level low-resolution feature map and the convolution block fuses + the upsampled high-level low-resolution feature map and the low-level + high-resolution feature map from encoder. + + Args: + conv_block (nn.Sequential): Sequential of convolutional layers. + in_channels (int): Number of input channels of the high-level + skip_channels (int): Number of input channels of the low-level + high-resolution feature map from encoder. + out_channels (int): Number of output channels. + num_convs (int): Number of convolutional layers in the conv_block. + Default: 2. + stride (int): Stride of convolutional layer in conv_block. Default: 1. + dilation (int): Dilation rate of convolutional layer in conv_block. + Default: 1. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + upsample_cfg (dict): The upsample config of the upsample module in + decoder. Default: dict(type='InterpConv'). If the size of + high-level feature map is the same as that of skip feature map + (low-level feature map from encoder), it does not need upsample the + high-level feature map and the upsample_cfg is None. + dcn (bool): Use deformable convolution in convolutional layer or not. + Default: None. 
+ plugins (dict): plugins for convolutional layers. Default: None. + """ + + def __init__(self, + conv_block, + in_channels, + skip_channels, + out_channels, + num_convs=2, + stride=1, + dilation=1, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + upsample_cfg=dict(type='InterpConv'), + dcn=None, + plugins=None): + super(UpConvBlock, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + + self.conv_block = conv_block( + in_channels=2 * skip_channels, + out_channels=out_channels, + num_convs=num_convs, + stride=stride, + dilation=dilation, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dcn=None, + plugins=None) + if upsample_cfg is not None: + self.upsample = build_upsample_layer( + cfg=upsample_cfg, + in_channels=in_channels, + out_channels=skip_channels, + with_cp=with_cp, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.upsample = ConvModule( + in_channels, + skip_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, skip, x): + """Forward function.""" + + x = self.upsample(x) + out = torch.cat([skip, x], dim=1) + out = self.conv_block(out) + + return out diff --git a/lavis/common/annotator/uniformer/mmseg/models/utils/weight_init.py b/lavis/common/annotator/uniformer/mmseg/models/utils/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..38141ba3d61f64ddfc0a31574b4648cbad96d7dd --- /dev/null +++ b/lavis/common/annotator/uniformer/mmseg/models/utils/weight_init.py @@ -0,0 +1,62 @@ +"""Modified from https://github.com/rwightman/pytorch-image- +models/blob/master/timm/models/layers/drop.py.""" + +import math +import warnings + +import torch + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + """Reference: https://people.sc.fsu.edu/~jburkardt/presentations + /truncated_normal.pdf""" + + def norm_cdf(x): + # 
Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower_bound = norm_cdf((a - mean) / std) + upper_bound = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * lower_bound - 1, 2 * upper_bound - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
class Encoding(nn.Module):
    """Encoding Layer: a learnable residual encoder.

    Input is of shape (batch_size, channels, height, width).
    Output is of shape (batch_size, num_codes, channels).

    Args:
        channels: dimension of the features or feature channels
        num_codes: number of code words
    """

    def __init__(self, channels, num_codes):
        super(Encoding, self).__init__()
        # init codewords and smoothing factor
        self.channels, self.num_codes = channels, num_codes
        std = 1. / ((num_codes * channels)**0.5)
        # [num_codes, channels]
        self.codewords = nn.Parameter(
            torch.empty(num_codes, channels,
                        dtype=torch.float).uniform_(-std, std),
            requires_grad=True)
        # [num_codes]
        self.scale = nn.Parameter(
            torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0),
            requires_grad=True)

    @staticmethod
    def scaled_l2(x, codewords, scale):
        """Scaled squared distance of every feature to every codeword.

        Args:
            x (Tensor): Features, [batch_size, num_feats, channels].
            codewords (Tensor): [num_codes, channels].
            scale (Tensor): [num_codes].

        Returns:
            Tensor: [batch_size, num_feats, num_codes].
        """
        num_codes, channels = codewords.size()
        batch_size = x.size(0)
        reshaped_scale = scale.view((1, 1, num_codes))
        expanded_x = x.unsqueeze(2).expand(
            (batch_size, x.size(1), num_codes, channels))
        reshaped_codewords = codewords.view((1, 1, num_codes, channels))

        scaled_l2_norm = reshaped_scale * (
            expanded_x - reshaped_codewords).pow(2).sum(dim=3)
        return scaled_l2_norm

    @staticmethod
    def aggregate(assignment_weights, x, codewords):
        """Sum the weighted residuals (feature - codeword) over features.

        Args:
            assignment_weights (Tensor):
                [batch_size, num_feats, num_codes].
            x (Tensor): Features, [batch_size, num_feats, channels].
            codewords (Tensor): [num_codes, channels].

        Returns:
            Tensor: [batch_size, num_codes, channels].
        """
        num_codes, channels = codewords.size()
        reshaped_codewords = codewords.view((1, 1, num_codes, channels))
        batch_size = x.size(0)

        expanded_x = x.unsqueeze(2).expand(
            (batch_size, x.size(1), num_codes, channels))
        encoded_feat = (assignment_weights.unsqueeze(3) *
                        (expanded_x - reshaped_codewords)).sum(dim=1)
        return encoded_feat

    def forward(self, x):
        """Encode a feature map.

        Args:
            x (Tensor): [batch_size, channels, height, width]; may be
                non-contiguous.

        Returns:
            Tensor: [batch_size, num_codes, channels].
        """
        assert x.dim() == 4 and x.size(1) == self.channels
        # [batch_size, channels, height, width]
        #   -> [batch_size, height x width, channels]
        # ``flatten`` (unlike ``view``) also accepts non-contiguous input,
        # e.g. a permuted or channels-last feature map.
        x = x.flatten(2).transpose(1, 2).contiguous()
        # assignment_weights: [batch_size, height x width, num_codes]
        assignment_weights = F.softmax(
            self.scaled_l2(x, self.codewords, self.scale), dim=2)
        # aggregate
        encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
        return encoded_feat

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(Nx{self.channels}xHxW =>Nx{self.num_codes}' \
                    f'x{self.channels})'
        return repr_str
def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    """Wrapper around ``F.interpolate`` with an alignment sanity warning.

    Args:
        input (Tensor): Tensor to resize; ``input.shape[2:]`` is read as
            (height, width) for the warning check.
        size (tuple[int] | None): Target (height, width).
        scale_factor (float | tuple[float] | None): Scale factor, passed
            through to ``F.interpolate``.
        mode (str): Interpolation mode. Default: 'nearest'.
        align_corners (bool | None): Passed through to ``F.interpolate``.
        warning (bool): Whether to run the alignment check. Default: True.

    Returns:
        Tensor: The result of ``F.interpolate``.
    """
    if warning and size is not None and align_corners:
        input_h, input_w = tuple(int(x) for x in input.shape[2:])
        output_h, output_w = tuple(int(x) for x in size)
        # Only upsampling is checked. The width must be compared with the
        # *input* width (it used to be compared with the output height, which
        # missed width-only upsampling and fired on pure downsampling).
        if output_h > input_h or output_w > input_w:
            if ((output_h > 1 and output_w > 1 and input_h > 1
                 and input_w > 1) and (output_h - 1) % (input_h - 1)
                    and (output_w - 1) % (input_w - 1)):
                warnings.warn(
                    f'When align_corners={align_corners}, '
                    'the output would more aligned if '
                    f'input size {(input_h, input_w)} is `x+1` and '
                    f'out size {(output_h, output_w)} is `nx+1`')
    return F.interpolate(input, size, scale_factor, mode, align_corners)


class Upsample(nn.Module):
    """Module wrapper around :func:`resize`.

    Args:
        size (tuple[int] | None): Fixed output (height, width). When falsy,
            the output size is derived from ``scale_factor``.
        scale_factor (float | tuple[float] | None): Multiplier applied to the
            last two dimensions of the input; a tuple gives one factor per
            dimension.
        mode (str): Interpolation mode. Default: 'nearest'.
        align_corners (bool | None): Passed through to :func:`resize`.
    """

    def __init__(self,
                 size=None,
                 scale_factor=None,
                 mode='nearest',
                 align_corners=None):
        super(Upsample, self).__init__()
        self.size = size
        if isinstance(scale_factor, tuple):
            self.scale_factor = tuple(float(factor) for factor in scale_factor)
        else:
            self.scale_factor = float(scale_factor) if scale_factor else None
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Resize ``x`` to ``self.size`` or by ``self.scale_factor``."""
        if self.size:
            size = self.size
        else:
            # ``__init__`` accepts a tuple of per-dimension factors, so use
            # them dimension-wise; a scalar factor applies to both dims.
            factors = self.scale_factor
            if not isinstance(factors, tuple):
                factors = (factors, ) * 2
            size = [
                int(dim * factor)
                for dim, factor in zip(x.shape[-2:], factors)
            ]
        return resize(x, size, None, self.mode, self.align_corners)
def collect_env():
    """Return the base environment report plus an ``MMSegmentation`` entry.

    The entry is the mmseg version followed by ``+`` and the first seven
    characters of the value returned by ``get_git_hash()``.
    """
    report = collect_base_env()
    version_tag = f'{mmseg.__version__}+{get_git_hash()[:7]}'
    report['MMSegmentation'] = version_tag
    return report
def HWC3(x):
    """Convert a uint8 image array to a three-channel H x W x 3 array.

    A 2-D input gains a trailing channel axis first. One channel is repeated
    three times, three channels are returned unchanged (same object), and
    four channels are blended over a white background using the fourth
    channel as opacity. Any other dtype or channel count fails an assertion.
    """
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[..., np.newaxis]
    assert x.ndim == 3
    channels = x.shape[2]
    assert channels in (1, 3, 4)

    if channels == 1:
        return np.concatenate((x, x, x), axis=2)
    if channels == 3:
        return x

    # Four channels: composite the colour planes onto white (255).
    rgb = x[:, :, :3].astype(np.float32)
    opacity = x[:, :, 3:].astype(np.float32) / 255.0
    blended = rgb * opacity + 255.0 * (1.0 - opacity)
    return blended.clip(0, 255).astype(np.uint8)
class Config:
    """Experiment configuration assembled from a YAML file and user overrides.

    ``args`` must provide ``cfg_path`` (loaded with ``OmegaConf.load``) and
    ``options`` (a list of overrides, either ``["key=value", ...]`` or
    alternating ``["key", "value", ...]``). On construction the instance
    registers itself in the registry under ``"configuration"`` and stores the
    merged runner/model/dataset/user configuration in ``self.config``.
    """

    def __init__(self, args):
        self.config = {}

        self.args = args

        # Register the config and configuration for setup
        registry.register("configuration", self)

        user_config = self._build_opt_list(self.args.options)

        config = OmegaConf.load(self.args.cfg_path)

        runner_config = self.build_runner_config(config)
        model_config = self.build_model_config(config, **user_config)
        dataset_config = self.build_dataset_config(config)

        # Validate the user-provided runner configuration
        # model and dataset configuration are supposed to be validated by the respective classes
        # [TODO] validate the model/dataset configuration
        # self._validate_runner_config(runner_config)

        # Override the default configuration with user options.
        self.config = OmegaConf.merge(
            runner_config, model_config, dataset_config, user_config
        )

    def _validate_runner_config(self, runner_config):
        """
        This method validates the configuration, such that
            1) all the user specified options are valid;
            2) no type mismatches between the user specified options and the config.
        """
        runner_config_validator = create_runner_config_validator()
        runner_config_validator.validate(runner_config)

    def _build_opt_list(self, opts):
        """Turn the raw override list into an OmegaConf config."""
        opts_dot_list = self._convert_to_dot_list(opts)
        return OmegaConf.from_dotlist(opts_dot_list)

    @staticmethod
    def build_model_config(config, **kwargs):
        """Merge the model's default config with the ``model`` section of ``config``.

        The default config path is chosen by ``model_type``. A user-supplied
        type (from ``kwargs``) takes precedence over the one in ``config``.
        """
        model = config.get("model", None)
        assert model is not None, "Missing model configuration file."

        model_cls = registry.get_model_class(model.arch)
        assert model_cls is not None, f"Model '{model.arch}' has not been registered."

        # User options are unpacked from a config built by
        # OmegaConf.from_dotlist, so "model.model_type=x" arrives nested as
        # kwargs["model"]["model_type"]. The flat dotted key is still honoured
        # for callers that pass it explicitly.
        model_type = kwargs.get("model.model_type", None)
        if not model_type:
            user_model = kwargs.get("model", None)
            if user_model is not None and hasattr(user_model, "get"):
                model_type = user_model.get("model_type", None)
        if not model_type:
            model_type = model.get("model_type", None)
        # else use the model type selected by user.

        assert model_type is not None, "Missing model_type."

        model_config_path = model_cls.default_config_path(model_type=model_type)

        model_config = OmegaConf.create()
        # hierarchy override, customized config > default config
        model_config = OmegaConf.merge(
            model_config,
            OmegaConf.load(model_config_path),
            {"model": config["model"]},
        )

        return model_config

    @staticmethod
    def build_runner_config(config):
        """Return the ``run`` section wrapped under the ``"run"`` key."""
        return {"run": config.run}

    @staticmethod
    def build_dataset_config(config):
        """Merge each dataset's default config with its section in ``config``.

        Raises:
            KeyError: if ``config`` has no ``datasets`` section.
        """
        datasets = config.get("datasets", None)
        if datasets is None:
            raise KeyError(
                "Expecting 'datasets' as the root key for dataset configuration."
            )

        dataset_config = OmegaConf.create()

        for dataset_name in datasets:
            builder_cls = registry.get_builder_class(dataset_name)
            # Same style as the model lookup above: fail with a readable
            # message instead of an AttributeError on None.
            assert (
                builder_cls is not None
            ), f"Dataset builder '{dataset_name}' has not been registered."

            dataset_config_type = datasets[dataset_name].get("type", "default")
            dataset_config_path = builder_cls.default_config_path(
                type=dataset_config_type
            )

            # hierarchy override, customized config > default config
            dataset_config = OmegaConf.merge(
                dataset_config,
                OmegaConf.load(dataset_config_path),
                {"datasets": {dataset_name: config["datasets"][dataset_name]}},
            )

        return dataset_config

    def _convert_to_dot_list(self, opts):
        """Normalise overrides to ``["key=value", ...]``.

        ``None`` becomes ``[]``. If the first entry already contains ``=`` the
        list is returned unchanged; otherwise entries are read as alternating
        key/value pairs (an unpaired trailing key is ignored).
        """
        if opts is None:
            opts = []

        if not opts:
            return opts

        if "=" in opts[0]:
            return opts

        return [(opt + "=" + value) for opt, value in zip(opts[0::2], opts[1::2])]

    def get_config(self):
        return self.config

    @property
    def run_cfg(self):
        return self.config.run

    @property
    def datasets_cfg(self):
        return self.config.datasets

    @property
    def model_cfg(self):
        return self.config.model

    def pretty_print(self):
        """Log the run, dataset and model sections as indented JSON."""
        logging.info("\n===== Running Parameters =====")
        logging.info(self._convert_node_to_json(self.config.run))

        logging.info("\n====== Dataset Attributes ======")
        datasets = self.config.datasets

        # Names come from iterating the datasets node itself, so each one is
        # present by construction.
        for dataset in datasets:
            logging.info(f"\n======== {dataset} =======")
            dataset_config = self.config.datasets[dataset]
            logging.info(self._convert_node_to_json(dataset_config))

        logging.info("\n====== Model Attributes ======")
        logging.info(self._convert_node_to_json(self.config.model))

    def _convert_node_to_json(self, node):
        """Serialise a config node to sorted, 4-space-indented JSON."""
        container = OmegaConf.to_container(node, resolve=True)
        return json.dumps(container, indent=4, sort_keys=True)

    def to_dict(self):
        return OmegaConf.to_container(self.config)
def add_argument(self, *args, **kwargs):
    """Register an argument description, keyed by its first positional value.

    All positional and keyword values are forwarded to ``self._Argument``;
    the resulting object is stored in ``self.arguments`` under ``args[0]``.
    """
    argument = self._Argument(*args, **kwargs)
    name = args[0]
    self.arguments[name] = argument
def create_runner_config_validator():
    """Build the ``ConfigValidator`` describing every supported runner option.

    Each ``add_argument`` call declares one key of the ``run`` section with
    an optional converter (``type``), an optional list of allowed values
    (``choices``) and a help string.

    Returns:
        ConfigValidator: the populated validator.
    """
    validator = ConfigValidator(description="Runner configurations")

    validator.add_argument(
        "runner",
        type=str,
        choices=["runner_base", "runner_iter"],
        help="""Runner to use. The "runner_base" uses epoch-based training while iter-based
            runner runs based on iters. Default: runner_base""",
    )
    # add arguments for training dataset ratios
    validator.add_argument(
        "train_dataset_ratios",
        # ``typing.Dict[str, float]`` cannot be called as a converter (it
        # raises TypeError), so the plain ``dict`` constructor is used.
        type=dict,
        help="""Ratios of training dataset. This is used in iteration-based runner.
        Do not support for epoch-based runner because how to define an epoch becomes tricky.
        Default: None""",
    )
    validator.add_argument(
        "max_iters",
        type=float,
        help="Maximum number of iterations to run.",
    )
    validator.add_argument(
        "max_epoch",
        type=int,
        help="Maximum number of epochs to run.",
    )
    # add arguments for iters_per_inner_epoch
    validator.add_argument(
        "iters_per_inner_epoch",
        type=float,
        help="Number of iterations per inner epoch. This is required when runner is runner_iter.",
    )
    lr_scheds_choices = registry.list_lr_schedulers()
    validator.add_argument(
        "lr_sched",
        type=str,
        choices=lr_scheds_choices,
        help="Learning rate scheduler to use, from {}".format(lr_scheds_choices),
    )
    task_choices = registry.list_tasks()
    validator.add_argument(
        "task",
        type=str,
        choices=task_choices,
        help="Task to use, from {}".format(task_choices),
    )
    # add arguments for init_lr
    validator.add_argument(
        "init_lr",
        type=float,
        help="Initial learning rate. This will be the learning rate after warmup and before decay.",
    )
    # add arguments for min_lr
    validator.add_argument(
        "min_lr",
        type=float,
        help="Minimum learning rate (after decay).",
    )
    # add arguments for warmup_lr
    validator.add_argument(
        "warmup_lr",
        type=float,
        help="Starting learning rate for warmup.",
    )
    # add arguments for learning rate decay rate
    validator.add_argument(
        "lr_decay_rate",
        type=float,
        help="Learning rate decay rate. Required if using a decaying learning rate scheduler.",
    )
    # add arguments for weight decay
    validator.add_argument(
        "weight_decay",
        type=float,
        help="Weight decay rate.",
    )
    # add arguments for training batch size
    validator.add_argument(
        "batch_size_train",
        type=int,
        help="Training batch size.",
    )
    # add arguments for evaluation batch size
    validator.add_argument(
        "batch_size_eval",
        type=int,
        help="Evaluation batch size, including validation and testing.",
    )
    # add arguments for number of workers for data loading
    validator.add_argument(
        "num_workers",
        help="Number of workers for data loading.",
    )
    # add arguments for warm up steps
    validator.add_argument(
        "warmup_steps",
        type=int,
        help="Number of warmup steps. Required if a warmup schedule is used.",
    )
    # add arguments for random seed
    validator.add_argument(
        "seed",
        type=int,
        help="Random seed.",
    )
    # add arguments for output directory
    validator.add_argument(
        "output_dir",
        type=str,
        help="Output directory to save checkpoints and logs.",
    )
    # add arguments for whether only use evaluation
    validator.add_argument(
        "evaluate",
        help="Whether to only evaluate the model. If true, training will not be performed.",
    )
    # add arguments for splits used for training, e.g. ["train", "val"]
    validator.add_argument(
        "train_splits",
        type=list,
        help="Splits to use for training.",
    )
    # add arguments for splits used for validation, e.g. ["val"]
    validator.add_argument(
        "valid_splits",
        type=list,
        help="Splits to use for validation. If not provided, will skip the validation.",
    )
    # add arguments for splits used for testing, e.g. ["test"]
    validator.add_argument(
        "test_splits",
        type=list,
        help="Splits to use for testing. If not provided, will skip the testing.",
    )
    # add arguments for accumulating gradient for iterations
    validator.add_argument(
        "accum_grad_iters",
        type=int,
        help="Number of iterations to accumulate gradient for.",
    )

    # ====== distributed training ======
    validator.add_argument(
        "device",
        type=str,
        choices=["cpu", "cuda"],
        help="Device to use. Support 'cuda' or 'cpu' as for now.",
    )
    validator.add_argument(
        "world_size",
        type=int,
        help="Number of processes participating in the job.",
    )
    validator.add_argument("dist_url", type=str)
    validator.add_argument("distributed", type=bool)
    # add arguments to opt using distributed sampler during evaluation or not
    validator.add_argument(
        "use_dist_eval_sampler",
        type=bool,
        help="Whether to use distributed sampler during evaluation or not.",
    )

    # ====== task specific ======
    # generation task specific arguments
    # add arguments for maximal length of text output
    validator.add_argument(
        "max_len",
        type=int,
        help="Maximal length of text output.",
    )
    # add arguments for minimal length of text output
    validator.add_argument(
        "min_len",
        type=int,
        help="Minimal length of text output.",
    )
    # add arguments number of beams
    validator.add_argument(
        "num_beams",
        type=int,
        help="Number of beams used for beam search.",
    )

    # vqa task specific arguments
    # add arguments for number of answer candidates
    validator.add_argument(
        "num_ans_candidates",
        type=int,
        help="""For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
    )
    # add arguments for inference method
    validator.add_argument(
        "inference_method",
        type=str,
        # Was misspelled "genearte", which rejected the real value "generate".
        choices=["generate", "rank"],
        help="""Inference method to use for question answering. If rank, requires a answer list.""",
    )

    # ====== model specific ======
    validator.add_argument(
        "k_test",
        type=int,
        help="Number of top k most similar samples from ITC/VTC selection to be tested.",
    )

    return validator
def is_dist_avail_and_initialized():
    """Return True only if ``torch.distributed`` is available and initialized."""
    return dist.is_available() and dist.is_initialized()
def get_dist_info():
    """Return ``(rank, world_size)`` of the current process.

    Falls back to ``(0, 1)`` when ``torch.distributed`` is unavailable or no
    process group has been initialized (non-distributed runs).
    """
    # The old ``torch.__version__ < "1.0"`` branch compared version strings
    # lexicographically and is dropped. The availability check matches the
    # other helpers in this module, which guard ``dist.is_initialized()`` with
    # ``dist.is_available()``.
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank(), dist.get_world_size()
    # non-distributed training
    return 0, 1
class SmoothedValue(object):
    """Running statistics over a stream of values.

    The most recent ``window_size`` values are kept in a deque for the
    windowed statistics (``median``, ``avg``, ``max``, ``value``), while
    ``total`` and ``count`` accumulate over the whole series for
    ``global_avg``. ``fmt`` is the template used by ``str()``.
    """

    def __init__(self, window_size=20, fmt=None):
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0

    def update(self, value, n=1):
        """Record ``value`` once in the window and ``n`` times in the totals."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not dist_utils.is_dist_avail_and_initialized():
            return
        packed = torch.tensor(
            [self.count, self.total], dtype=torch.float64, device="cuda"
        )
        dist.barrier()
        dist.all_reduce(packed)
        reduced_count, reduced_total = packed.tolist()
        self.count = int(reduced_count)
        self.total = reduced_total

    @property
    def median(self):
        window = torch.tensor(list(self.deque))
        return window.median().item()

    @property
    def avg(self):
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        # Most recently recorded value.
        return self.deque[-1]

    def __str__(self):
        stats = {
            "median": self.median,
            "avg": self.avg,
            "global_avg": self.global_avg,
            "max": self.max,
            "value": self.value,
        }
        return self.fmt.format(**stats)
class AttrDict(dict):
    """A ``dict`` whose entries can also be read and written as attributes.

    The instance's ``__dict__`` is the mapping itself, so attribute access
    and item access share the same storage.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.__dict__ = self
@registry.register_lr_scheduler("linear_warmup_step_lr")
class LinearWarmupStepLRScheduler:
    """Linear warmup during epoch 0, then step decay in later epochs.

    A negative ``warmup_start_lr`` means "start the warmup at ``init_lr``".
    Extra keyword arguments are accepted and ignored.
    """

    def __init__(
        self,
        optimizer,
        max_epoch,
        min_lr,
        init_lr,
        decay_rate=1,
        warmup_start_lr=-1,
        warmup_steps=0,
        **kwargs
    ):
        self.optimizer = optimizer

        self.max_epoch = max_epoch
        self.min_lr = min_lr
        self.init_lr = init_lr

        self.decay_rate = decay_rate

        self.warmup_steps = warmup_steps
        if warmup_start_lr >= 0:
            self.warmup_start_lr = warmup_start_lr
        else:
            self.warmup_start_lr = init_lr

    def step(self, cur_epoch, cur_step):
        """Set the optimizer learning rate for the given epoch and step."""
        if cur_epoch != 0:
            step_lr_schedule(
                epoch=cur_epoch,
                optimizer=self.optimizer,
                init_lr=self.init_lr,
                min_lr=self.min_lr,
                decay_rate=self.decay_rate,
            )
            return

        warmup_lr_schedule(
            step=cur_step,
            optimizer=self.optimizer,
            max_step=self.warmup_steps,
            init_lr=self.warmup_start_lr,
            max_lr=self.init_lr,
        )
def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
    """Linearly ramp the learning rate from ``init_lr`` toward ``max_lr``.

    The value after ``step`` of ``max_step`` steps (``max_step`` is treated
    as at least 1) is capped at ``max_lr`` and written to the ``"lr"`` entry
    of every parameter group of ``optimizer``.
    """
    total_steps = max(max_step, 1)
    ramped = init_lr + (max_lr - init_lr) * step / total_steps
    new_lr = min(max_lr, ramped)
    for group in optimizer.param_groups:
        group["lr"] = new_lr
class Registry:
    """Process-wide registry of LAVIS components, paths and shared state.

    Everything is stored in the class-level ``mapping`` dict, so all
    instances (and the class itself) share one set of registrations:

    * ``*_name_mapping`` tables map a string key to a registered class.
    * ``paths`` maps a key to a filesystem path string.
    * ``state`` is a nested dict addressed with dotted names ("a.b.c").
    """

    # Shared on purpose: registrations are global to the process.
    mapping = {
        "builder_name_mapping": {},
        "task_name_mapping": {},
        "processor_name_mapping": {},
        "model_name_mapping": {},
        "lr_scheduler_name_mapping": {},
        "runner_name_mapping": {},
        "state": {},
        "paths": {},
    }

    @classmethod
    def _store(cls, mapping_key, name, obj):
        """Store ``obj`` under ``name`` in ``mapping[mapping_key]``.

        Raises:
            KeyError: if ``name`` is already present in that table.
        """
        table = cls.mapping[mapping_key]
        if name in table:
            raise KeyError(
                "Name '{}' already registered for {}.".format(name, table[name])
            )
        table[name] = obj
        return obj

    @classmethod
    def register_builder(cls, name):
        r"""Register a dataset builder to registry with key 'name'

        Args:
            name: Key with which the builder will be registered.

        Returns:
            A class decorator that registers and returns the builder class.

        Raises (when the decorator is applied):
            AssertionError: the class does not inherit BaseDatasetBuilder.
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry
            from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder

            @registry.register_builder("my_dataset")
            class MyBuilder(BaseDatasetBuilder):
                ...
        """

        def wrap(builder_cls):
            # Imported lazily so importing the registry does not pull in the
            # dataset package.
            from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder

            assert issubclass(
                builder_cls, BaseDatasetBuilder
            ), "All builders must inherit BaseDatasetBuilder class, found {}".format(
                builder_cls
            )
            return cls._store("builder_name_mapping", name, builder_cls)

        return wrap

    @classmethod
    def register_task(cls, name):
        r"""Register a task to registry with key 'name'

        Args:
            name: Key with which the task will be registered.

        Returns:
            A class decorator that registers and returns the task class.

        Raises (when the decorator is applied):
            AssertionError: the class does not inherit BaseTask.
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry

            @registry.register_task("my_task")
            class MyTask(BaseTask):
                ...
        """

        def wrap(task_cls):
            from lavis.tasks.base_task import BaseTask

            assert issubclass(
                task_cls, BaseTask
            ), "All tasks must inherit BaseTask class"
            return cls._store("task_name_mapping", name, task_cls)

        return wrap

    @classmethod
    def register_model(cls, name):
        r"""Register a model to registry with key 'name'

        Args:
            name: Key with which the model will be registered.

        Returns:
            A class decorator that registers and returns the model class.

        Raises (when the decorator is applied):
            AssertionError: the class does not inherit BaseModel.
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry

            @registry.register_model("my_model")
            class MyModel(BaseModel):
                ...
        """

        def wrap(model_cls):
            from lavis.models import BaseModel

            assert issubclass(
                model_cls, BaseModel
            ), "All models must inherit BaseModel class"
            return cls._store("model_name_mapping", name, model_cls)

        return wrap

    @classmethod
    def register_processor(cls, name):
        r"""Register a processor to registry with key 'name'

        Args:
            name: Key with which the processor will be registered.

        Returns:
            A class decorator that registers and returns the processor class.

        Raises (when the decorator is applied):
            AssertionError: the class does not inherit BaseProcessor.
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry

            @registry.register_processor("my_processor")
            class MyProcessor(BaseProcessor):
                ...
        """

        def wrap(processor_cls):
            from lavis.processors import BaseProcessor

            assert issubclass(
                processor_cls, BaseProcessor
            ), "All processors must inherit BaseProcessor class"
            return cls._store("processor_name_mapping", name, processor_cls)

        return wrap

    @classmethod
    def register_lr_scheduler(cls, name):
        r"""Register a learning-rate scheduler to registry with key 'name'

        No base-class check is performed for schedulers.

        Args:
            name: Key with which the scheduler will be registered.

        Returns:
            A class decorator that registers and returns the scheduler class.

        Raises (when the decorator is applied):
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry

            @registry.register_lr_scheduler("my_scheduler")
            class MyScheduler:
                ...
        """

        def wrap(lr_sched_cls):
            return cls._store("lr_scheduler_name_mapping", name, lr_sched_cls)

        return wrap

    @classmethod
    def register_runner(cls, name):
        r"""Register a runner to registry with key 'name'

        No base-class check is performed for runners.

        Args:
            name: Key with which the runner will be registered.

        Returns:
            A class decorator that registers and returns the runner class.

        Raises (when the decorator is applied):
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry

            @registry.register_runner("my_runner")
            class MyRunner:
                ...
        """

        def wrap(runner_cls):
            return cls._store("runner_name_mapping", name, runner_cls)

        return wrap

    @classmethod
    def register_path(cls, name, path):
        r"""Register a path to registry with key 'name'

        Args:
            name: Key with which the path will be registered.
            path (str): The path to store.

        Raises:
            AssertionError: 'path' is not a str.
            KeyError: 'name' is already registered.

        Usage:

            from lavis.common.registry import registry

            registry.register_path("cache_root", "/tmp/cache")
        """
        assert isinstance(path, str), "All path must be str."
        if name in cls.mapping["paths"]:
            raise KeyError("Name '{}' already registered.".format(name))
        cls.mapping["paths"][name] = path

    @classmethod
    def register(cls, name, obj):
        r"""Register an item to registry with key 'name'

        A dotted name addresses nested dicts inside the shared state;
        missing intermediate dicts are created. An existing value at the
        final key is overwritten.

        Args:
            name: Key with which the item will be registered.
            obj: The item to store.

        Usage::

            from lavis.common.registry import registry

            registry.register("config", {})
        """
        *parents, leaf = name.split(".")
        current = cls.mapping["state"]

        for part in parents:
            if part not in current:
                current[part] = {}
            current = current[part]

        current[leaf] = obj

    # @classmethod
    # def get_trainer_class(cls, name):
    #     return cls.mapping["trainer_name_mapping"].get(name, None)

    @classmethod
    def get_builder_class(cls, name):
        """Return the builder class registered as ``name``, or None."""
        return cls.mapping["builder_name_mapping"].get(name, None)

    @classmethod
    def get_model_class(cls, name):
        """Return the model class registered as ``name``, or None."""
        return cls.mapping["model_name_mapping"].get(name, None)

    @classmethod
    def get_task_class(cls, name):
        """Return the task class registered as ``name``, or None."""
        return cls.mapping["task_name_mapping"].get(name, None)

    @classmethod
    def get_processor_class(cls, name):
        """Return the processor class registered as ``name``, or None."""
        return cls.mapping["processor_name_mapping"].get(name, None)

    @classmethod
    def get_lr_scheduler_class(cls, name):
        """Return the LR scheduler class registered as ``name``, or None."""
        return cls.mapping["lr_scheduler_name_mapping"].get(name, None)

    @classmethod
    def get_runner_class(cls, name):
        """Return the runner class registered as ``name``, or None."""
        return cls.mapping["runner_name_mapping"].get(name, None)

    @classmethod
    def list_runners(cls):
        """Return the sorted names of all registered runners."""
        return sorted(cls.mapping["runner_name_mapping"].keys())

    @classmethod
    def list_models(cls):
        """Return the sorted names of all registered models."""
        return sorted(cls.mapping["model_name_mapping"].keys())

    @classmethod
    def list_tasks(cls):
        """Return the sorted names of all registered tasks."""
        return sorted(cls.mapping["task_name_mapping"].keys())

    @classmethod
    def list_processors(cls):
        """Return the sorted names of all registered processors."""
        return sorted(cls.mapping["processor_name_mapping"].keys())

    @classmethod
    def list_lr_schedulers(cls):
        """Return the sorted names of all registered LR schedulers."""
        return sorted(cls.mapping["lr_scheduler_name_mapping"].keys())

    @classmethod
    def list_datasets(cls):
        """Return the sorted names of all registered dataset builders."""
        return sorted(cls.mapping["builder_name_mapping"].keys())

    @classmethod
    def get_path(cls, name):
        """Return the path registered as ``name``, or None."""
        return cls.mapping["paths"].get(name, None)

    @classmethod
    def get(cls, name, default=None, no_warning=False):
        r"""Get an item from registry with key 'name'

        Args:
            name (string): Key whose value needs to be retrieved. Dots
                descend into nested mappings.
            default: Returned when the key is not in the registry, with a
                warning if a "writer" is registered. Default: None
            no_warning (bool): If passed as True, warning when key doesn't exist
                will not be generated. Default: False

        Returns:
            The stored value, or ``default`` when any path component is
            missing or an intermediate value cannot be descended into.
        """
        original_name = name
        value = cls.mapping["state"]
        for subname in name.split("."):
            # Duck-type on ``.get`` rather than ``dict`` so mapping-like
            # config objects stored in the state can still be traversed. A
            # stored leaf (int, str, ...) has no ``.get``: treat the rest of
            # the path as missing instead of raising AttributeError.
            getter = getattr(value, "get", None)
            if getter is None:
                value = default
                break
            value = getter(subname, default)
            if value is default:
                break

        if (
            "writer" in cls.mapping["state"]
            and value == default
            and no_warning is False
        ):
            cls.mapping["state"]["writer"].warning(
                "Key {} is not present in registry, returning default value "
                "of {}".format(original_name, default)
            )
        return value

    @classmethod
    def unregister(cls, name):
        r"""Remove an item from registry with key 'name'

        An exact top-level key is removed first. Otherwise a dotted name is
        resolved through nested dicts, mirroring ``register``. Emptied
        parent dicts are left in place.

        Args:
            name: Key which needs to be removed.

        Returns:
            The removed value, or None if nothing was stored under 'name'.

        Usage::

            from lavis.common.registry import registry

            config = registry.unregister("config")
        """
        state = cls.mapping["state"]
        if name in state:
            return state.pop(name)

        *parents, leaf = name.split(".")
        current = state
        for part in parents:
            if not isinstance(current, dict) or part not in current:
                return None
            current = current[part]
        if isinstance(current, dict):
            return current.pop(leaf, None)
        return None


registry = Registry()
def now():
    """Return the current local time formatted as ``YYYYmmddHHM``.

    The last minute digit is dropped, so the value changes only once every
    ten minutes.
    """
    from datetime import datetime

    # NOTE(review): the truncation presumably lets jobs started a few minutes
    # apart share one identifier -- confirm with callers before changing.
    return datetime.now().strftime("%Y%m%d%H%M")[:-1]


def is_url(input_url):
    """
    Check if an input string is a url. look for http(s):// and ignoring the case
    """
    # This module used to define ``is_url`` twice; this is the later
    # definition, which was the one in effect at runtime.
    return re.match(r"^(?:http)s?://", input_url, re.IGNORECASE) is not None


def get_cache_path(rel_path):
    """Join ``rel_path`` onto the registered "cache_root" and expand ``~``."""
    return os.path.expanduser(os.path.join(registry.get_path("cache_root"), rel_path))


def get_abs_path(rel_path):
    """Join ``rel_path`` onto the registered "library_root"."""
    return os.path.join(registry.get_path("library_root"), rel_path)


def load_json(filename):
    """Read ``filename`` and return the parsed JSON value."""
    with open(filename, "r") as f:
        return json.load(f)


# The following are adapted from torchvision and vissl
# torchvision: https://github.com/pytorch/vision
# vissl: https://github.com/facebookresearch/vissl/blob/main/vissl/utils/download.py


def makedir(dir_path):
    """
    Create the directory if it does not exist.

    Returns:
        bool: True when the directory exists afterwards, False when creating
        it failed (the failure is logged, not raised).
    """
    # This module used to define ``makedir`` twice; the later (logging)
    # definition was the one in effect and is the one kept here.
    is_success = False
    try:
        if not g_pathmgr.exists(dir_path):
            g_pathmgr.mkdirs(dir_path)
        is_success = True
    except Exception:
        # Best effort: report and let the caller decide. ``Exception`` (not
        # ``BaseException``) so Ctrl-C / SystemExit still propagate.
        logging.info(f"Error creating directory: {dir_path}")
    return is_success


def get_redirected_url(url: str):
    """
    Given a URL, returns the URL it redirects to or the
    original URL in case of no indirection
    """
    import requests

    with requests.Session() as session:
        with session.get(url, stream=True, allow_redirects=True) as response:
            # ``history`` is non-empty only when a redirect was followed.
            return response.url if response.history else url


def to_google_drive_download_url(view_url: str) -> str:
    """
    Utility function to transform a view URL of google drive
    to a download URL for google drive
    Example input:
        https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp/view
    Example output:
        https://drive.google.com/uc?export=download&id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp

    Raises:
        AssertionError: if the URL does not end with "/view".
    """
    splits = view_url.split("/")
    assert splits[-1] == "view"
    file_id = splits[-2]
    return f"https://drive.google.com/uc?export=download&id={file_id}"


def download_google_drive_url(url: str, output_path: str, output_file_name: str):
    """
    Download a file from google drive
    Downloading an URL from google drive requires confirmation when
    the file of the size is too big (google drive notifies that
    anti-viral checks cannot be performed on such files)
    """
    import requests

    with requests.Session() as session:

        # First get the confirmation token and append it to the URL
        with session.get(url, stream=True, allow_redirects=True) as response:
            for k, v in response.cookies.items():
                if k.startswith("download_warning"):
                    url = url + "&confirm=" + v

        # Then download the content of the file
        with session.get(url, stream=True, verify=True) as response:
            makedir(output_path)
            path = os.path.join(output_path, output_file_name)
            total_size = int(response.headers.get("Content-length", 0))
            with open(path, "wb") as file:
                from tqdm import tqdm

                with tqdm(total=total_size) as progress_bar:
                    for block in response.iter_content(
                        chunk_size=io.DEFAULT_BUFFER_SIZE
                    ):
                        file.write(block)
                        progress_bar.update(len(block))


def _get_google_drive_file_id(url: str) -> Optional[str]:
    """Return the file id of a Google Drive ``/file/d/<id>`` URL, else None."""
    parts = urlparse(url)

    if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
        return None

    # The group must be *named* "id": the unnamed form ``(?P[^/]*)`` is not a
    # valid pattern and made every Drive URL raise ``re.error``.
    match = re.match(r"/file/d/(?P<id>[^/]*)", parts.path)
    if match is None:
        return None

    return match.group("id")


def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
    """Stream ``url`` into ``filename`` in ``chunk_size``-byte reads."""
    with open(filename, "wb") as fh:
        with urllib.request.urlopen(
            urllib.request.Request(url, headers={"User-Agent": "vissl"})
        ) as response:
            with tqdm(total=response.length) as pbar:
                # ``read`` returns bytes, so the end-of-stream sentinel is b"".
                for chunk in iter(lambda: response.read(chunk_size), b""):
                    # The final chunk is usually shorter than chunk_size;
                    # advance by what was actually read.
                    pbar.update(len(chunk))
                    fh.write(chunk)


def download_url(
    url: str,
    root: str,
    filename: Optional[str] = None,
    md5: Optional[str] = None,
) -> None:
    """Download a file from a url and place it in root.
    Args:
        url (str): URL to download file from
        root (str): Directory to place downloaded file in
        filename (str, optional): Name to save the file under.
                                  If None, use the basename of the URL.
        md5 (str, optional): MD5 checksum of the download. If None, do not check

    Raises:
        RuntimeError: if the file is missing or fails the integrity check
            after downloading.
    """
    root = os.path.expanduser(root)
    if not filename:
        filename = os.path.basename(url)
    fpath = os.path.join(root, filename)

    makedir(root)

    # check if file is already present locally
    if check_integrity(fpath, md5):
        print("Using downloaded and verified file: " + fpath)
        return

    # expand redirect chain if needed
    url = get_redirected_url(url)

    # check if file is located on Google Drive
    file_id = _get_google_drive_file_id(url)
    if file_id is not None:
        return download_file_from_google_drive(file_id, root, filename, md5)

    # download the file
    try:
        print("Downloading " + url + " to " + fpath)
        _urlretrieve(url, fpath)
    except (urllib.error.URLError, IOError):  # type: ignore[attr-defined]
        if url[:5] != "https":
            raise
        # Retry once over plain http before giving up.
        url = url.replace("https:", "http:")
        print(
            "Failed download. Trying https -> http instead."
            " Downloading " + url + " to " + fpath
        )
        _urlretrieve(url, fpath)

    # check integrity of downloaded file
    if not check_integrity(fpath, md5):
        raise RuntimeError("File not found or corrupted.")


def download_and_extract_archive(
    url: str,
    download_root: str,
    extract_root: Optional[str] = None,
    filename: Optional[str] = None,
    md5: Optional[str] = None,
    remove_finished: bool = False,
) -> None:
    """Download an archive with ``download_url`` and extract it.

    Args:
        url: URL of the archive.
        download_root: Directory the archive is downloaded into.
        extract_root: Directory to extract into; defaults to download_root.
        filename: Local archive name; defaults to the basename of the URL.
        md5: Optional checksum forwarded to ``download_url``.
        remove_finished: Forwarded to ``extract_archive``.
    """
    download_root = os.path.expanduser(download_root)
    if extract_root is None:
        extract_root = download_root
    if not filename:
        filename = os.path.basename(url)

    download_url(url, download_root, filename, md5)

    archive = os.path.join(download_root, filename)
    print("Extracting {} to {}".format(archive, extract_root))
    extract_archive(archive, extract_root, remove_finished)


def cache_url(url: str, cache_dir: str) -> str:
    """
    This implementation downloads the remote resource and caches it locally.
    The resource will only be downloaded if not previously requested.

    Returns:
        str: path of the cached file.
    """
    parsed_url = urlparse(url)
    dirname = os.path.join(cache_dir, os.path.dirname(parsed_url.path.lstrip("/")))
    makedir(dirname)
    filename = url.split("/")[-1]
    cached = os.path.join(dirname, filename)
    with file_lock(cached):
        if not os.path.isfile(cached):
            logging.info(f"Downloading {url} to {cached} ...")
            cached = download(url, dirname, filename=filename)
    logging.info(f"URL {url} cached in {cached}")
    return cached


# TODO (prigoyal): convert this into RAII-style API
def create_file_symlink(file1, file2):
    """
    Simply create the symlinks for a given file1 to file2.
    Useful during model checkpointing to symlinks to the
    latest successful checkpoint.

    Failures are logged and swallowed (best effort).
    """
    try:
        if g_pathmgr.exists(file2):
            g_pathmgr.rm(file2)
        g_pathmgr.symlink(file1, file2)
    except Exception as e:
        logging.info(f"Could NOT create symlink. Error: {e}")


def save_file(data, filename, append_to_json=True, verbose=True):
    """
    Common i/o utility to handle saving data to various file formats.
    Supported:
        .pkl, .pickle, .npy, .json, .yaml
    Specifically for .json, users have the option to either append (default)
    or rewrite by passing in Boolean value to append_to_json.

    Raises:
        Exception: if the file extension is not one of the supported ones.
    """
    if verbose:
        logging.info(f"Saving data to file: {filename}")
    file_ext = os.path.splitext(filename)[1]
    if file_ext in [".pkl", ".pickle"]:
        with g_pathmgr.open(filename, "wb") as fopen:
            pickle.dump(data, fopen, pickle.HIGHEST_PROTOCOL)
    elif file_ext == ".npy":
        with g_pathmgr.open(filename, "wb") as fopen:
            np.save(fopen, data)
    elif file_ext == ".json":
        # One JSON document per line; append or truncate as requested.
        mode = "a" if append_to_json else "w"
        with g_pathmgr.open(filename, mode) as fopen:
            fopen.write(json.dumps(data, sort_keys=True) + "\n")
            fopen.flush()
    elif file_ext == ".yaml":
        with g_pathmgr.open(filename, "w") as fopen:
            dump = yaml.dump(data)
            fopen.write(dump)
            fopen.flush()
    else:
        raise Exception(f"Saving {file_ext} is not supported yet")

    if verbose:
        logging.info(f"Saved data to file: {filename}")


def load_file(filename, mmap_mode=None, verbose=True, allow_pickle=False):
    """
    Common i/o utility to handle loading data from various file formats.
    Supported:
        .txt, .pkl, .pickle, .npy, .json, .yaml, .csv
    For the npy files, we support reading the files in mmap_mode.
    If the mmap_mode of reading is not successful, we load data without the
    mmap_mode.

    Raises:
        Exception: if the file extension is not one of the supported ones.
    """
    if verbose:
        logging.info(f"Loading data from file: {filename}")

    file_ext = os.path.splitext(filename)[1]
    if file_ext == ".txt":
        with g_pathmgr.open(filename, "r") as fopen:
            data = fopen.readlines()
    elif file_ext in [".pkl", ".pickle"]:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files from trusted sources.
        with g_pathmgr.open(filename, "rb") as fopen:
            data = pickle.load(fopen, encoding="latin1")
    elif file_ext == ".npy":
        if mmap_mode:
            try:
                with g_pathmgr.open(filename, "rb") as fopen:
                    data = np.load(
                        fopen,
                        allow_pickle=allow_pickle,
                        encoding="latin1",
                        mmap_mode=mmap_mode,
                    )
            except ValueError as e:
                logging.info(
                    f"Could not mmap {filename}: {e}. Trying without g_pathmgr"
                )
                data = np.load(
                    filename,
                    allow_pickle=allow_pickle,
                    encoding="latin1",
                    mmap_mode=mmap_mode,
                )
                logging.info("Successfully loaded without g_pathmgr")
            except Exception:
                logging.info("Could not mmap without g_pathmgr. Trying without mmap")
                with g_pathmgr.open(filename, "rb") as fopen:
                    data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
        else:
            with g_pathmgr.open(filename, "rb") as fopen:
                data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
    elif file_ext == ".json":
        with g_pathmgr.open(filename, "r") as fopen:
            data = json.load(fopen)
    elif file_ext == ".yaml":
        with g_pathmgr.open(filename, "r") as fopen:
            data = yaml.load(fopen, Loader=yaml.FullLoader)
    elif file_ext == ".csv":
        with g_pathmgr.open(filename, "r") as fopen:
            data = pd.read_csv(fopen)
    else:
        raise Exception(f"Reading from {file_ext} is not supported yet")
    return data


def abspath(resource_path: str):
    """
    Make a path absolute, but take into account prefixes like
    "http://" or "manifold://"
    """
    # Anything of the form "<scheme>://..." is returned untouched.
    if re.match(r"^\w+://", resource_path) is not None:
        return resource_path
    return os.path.abspath(resource_path)


def download_and_untar(url):
    """Download a tar archive into the cache and extract it once.

    Returns:
        str: the directory next to the cached archive named after the URL
        basename up to its first dot.
    """
    cached_file = download_cached_file(
        url, check_hash=False, progress=True
    )
    # get path to untarred directory
    untarred_dir = os.path.basename(url).split(".")[0]
    parent_dir = os.path.dirname(cached_file)

    full_dir = os.path.join(parent_dir, untarred_dir)

    if not os.path.exists(full_dir):
        # NOTE(security): ``extractall`` trusts member paths inside the
        # archive; only use this with archives from trusted URLs.
        with tarfile.open(cached_file) as tar:
            tar.extractall(parent_dir)

    return full_dir


def cleanup_dir(dir):
    """
    Utility for deleting a directory. Useful for cleaning the storage space
    that contains various training artifacts like checkpoints, data etc.
    """
    if os.path.exists(dir):
        logging.info(f"Deleting directory: {dir}")
        shutil.rmtree(dir)
    logging.info(f"Deleted contents of directory: {dir}")


def get_file_size(filename):
    """
    Given a file, get the size of file in MB
    """
    return os.path.getsize(filename) / float(1024**2)


def is_serializable(value):
    """
    This function checks if the provided value can be serialized into a JSON string.
    """
    try:
        json.dumps(value)
    except (TypeError, OverflowError):
        return False
    return True


def is_convertible_to_int(value):
    """Return True if ``str(value)`` is an optionally negative run of digits."""
    return bool(re.match(r'^-?\d+$', str(value)))
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +__author__ = "aagrawal" diff --git a/lavis/common/vqa_tools/__pycache__/__init__.cpython-310.pyc b/lavis/common/vqa_tools/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8c35836f8d491d901d03e9456ae3944fa5ff92f Binary files /dev/null and b/lavis/common/vqa_tools/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/common/vqa_tools/__pycache__/vqa.cpython-310.pyc b/lavis/common/vqa_tools/__pycache__/vqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b567c4eb27170330bb9c5565295dc84313dc4d2 Binary files /dev/null and b/lavis/common/vqa_tools/__pycache__/vqa.cpython-310.pyc differ diff --git a/lavis/common/vqa_tools/__pycache__/vqa_eval.cpython-310.pyc b/lavis/common/vqa_tools/__pycache__/vqa_eval.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cb38cf259b9060783a511a9188c2a1071017428 Binary files /dev/null and b/lavis/common/vqa_tools/__pycache__/vqa_eval.cpython-310.pyc differ diff --git a/lavis/common/vqa_tools/vqa.py b/lavis/common/vqa_tools/vqa.py new file mode 100644 index 0000000000000000000000000000000000000000..a386b9094b0528b33e7511aff4027f30459a7ff7 --- /dev/null +++ b/lavis/common/vqa_tools/vqa.py @@ -0,0 +1,211 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +__author__ = "aagrawal" +__version__ = "0.9" + +# Interface for accessing the VQA dataset. + +# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link: +# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py). 
class VQA:
    """Helper for reading and querying VQA annotation / question files.

    Methods:
        getQuesIds - Get question ids that satisfy given filter conditions.
        getImgIds  - Get image ids that satisfy given filter conditions.
        loadQA     - Load questions and answers with the specified question ids.
        showQA     - Display the specified questions and answers.
        loadRes    - Load result file and create result object.

    Help on each function can be accessed by: "help(VQA.function)"
    """

    def __init__(self, annotation_file=None, question_file=None):
        """
        Constructor of VQA helper class for reading and visualizing questions and answers.
        :param annotation_file (str): location of VQA annotation file
        :param question_file (str): location of VQA question file
        :return:

        The index is only built when both files are given; otherwise the
        object starts empty (this is how ``loadRes`` creates its result).
        """
        # load dataset
        self.dataset = {}
        self.questions = {}
        self.qa = {}  # question_id -> annotation
        self.qqa = {}  # question_id -> question
        self.imgToQA = {}  # image_id -> list of annotations
        if annotation_file is not None and question_file is not None:
            print("loading VQA annotations and questions into memory...")
            with open(annotation_file, "r") as f:
                self.dataset = json.load(f)
            with open(question_file, "r") as f:
                self.questions = json.load(f)
            self.createIndex()

    def createIndex(self):
        """Build the ``qa``, ``qqa`` and ``imgToQA`` lookup tables."""
        # create index
        print("creating index...")
        annotations = self.dataset["annotations"]
        imgToQA = {ann["image_id"]: [] for ann in annotations}
        qa = {ann["question_id"]: [] for ann in annotations}
        # Questions without a matching entry in the question file keep [].
        qqa = {ann["question_id"]: [] for ann in annotations}
        for ann in annotations:
            imgToQA[ann["image_id"]] += [ann]
            qa[ann["question_id"]] = ann
        for ques in self.questions["questions"]:
            qqa[ques["question_id"]] = ques
        print("index created!")

        # create class members
        self.qa = qa
        self.qqa = qqa
        self.imgToQA = imgToQA

    def info(self):
        """
        Print information about the VQA annotation file.
        :return:
        """
        for key, value in self.dataset["info"].items():
            print("%s: %s" % (key, value))

    def _filterAnns(self, anns, quesTypes, ansTypes):
        """Keep annotations matching the question/answer type filters.

        An empty filter list means "do not filter on this field".
        """
        if len(quesTypes) != 0:
            anns = [ann for ann in anns if ann["question_type"] in quesTypes]
        if len(ansTypes) != 0:
            anns = [ann for ann in anns if ann["answer_type"] in ansTypes]
        return anns

    def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
        """
        Get question ids that satisfy given filter conditions. default skips that filter
        :param 	imgIds    (int array)   : get question ids for given imgs
                quesTypes (str array)   : get question ids for given question types
                ansTypes  (str array)   : get question ids for given answer types
        :return:    ids   (int array)   : integer array of question ids
        """
        # A single value is accepted in place of a list for every filter.
        imgIds = imgIds if isinstance(imgIds, list) else [imgIds]
        quesTypes = quesTypes if isinstance(quesTypes, list) else [quesTypes]
        ansTypes = ansTypes if isinstance(ansTypes, list) else [ansTypes]

        if len(imgIds) != 0:
            anns = sum(
                [self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA],
                [],
            )
        else:
            anns = self.dataset["annotations"]
        anns = self._filterAnns(anns, quesTypes, ansTypes)
        return [ann["question_id"] for ann in anns]

    def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
        """
        Get image ids that satisfy given filter conditions. default skips that filter
        :param quesIds   (int array)   : get image ids for given question ids
               quesTypes (str array)   : get image ids for given question types
               ansTypes  (str array)   : get image ids for given answer types
        :return: ids     (int array)   : integer array of image ids
        """
        quesIds = quesIds if isinstance(quesIds, list) else [quesIds]
        quesTypes = quesTypes if isinstance(quesTypes, list) else [quesTypes]
        ansTypes = ansTypes if isinstance(ansTypes, list) else [ansTypes]

        if len(quesIds) != 0:
            # ``self.qa`` maps each question id to ONE annotation dict, so the
            # matches are collected directly (summing dicts into a list raised
            # TypeError).
            anns = [self.qa[quesId] for quesId in quesIds if quesId in self.qa]
        else:
            anns = self.dataset["annotations"]
        anns = self._filterAnns(anns, quesTypes, ansTypes)
        return [ann["image_id"] for ann in anns]

    def loadQA(self, ids=[]):
        """
        Load questions and answers with the specified question ids.
        :param ids (int array)       : integer ids specifying question ids
        :return: qa (object array)   : loaded qa objects

        Returns None when ``ids`` is neither a list nor an int.
        """
        if isinstance(ids, list):
            return [self.qa[id] for id in ids]
        elif isinstance(ids, int):
            return [self.qa[ids]]

    def showQA(self, anns):
        """
        Display the specified annotations.
        :param anns (array of object): annotations to display
        :return: 0 when ``anns`` is empty, otherwise None
        """
        if len(anns) == 0:
            return 0
        for ann in anns:
            quesId = ann["question_id"]
            print("Question: %s" % (self.qqa[quesId]["question"]))
            for ans in ann["answers"]:
                print("Answer %d: %s" % (ans["answer_id"], ans["answer"]))

    def loadRes(self, resFile, quesFile):
        """
        Load result file and return a result object.
        :param   resFile (str)     : file name of result file
        :param   quesFile (str)    : file name of the question file
        :return: res (obj)         : result api object

        Raises AssertionError when the results are not a list, do not cover
        exactly the question ids of this object, or (multiple-choice tasks)
        contain an answer that is not one of the choices.
        """
        res = VQA()
        with open(quesFile) as f:
            res.questions = json.load(f)
        # Dataset metadata is copied from this object's question file.
        for field in ("info", "task_type", "data_type", "data_subtype", "license"):
            res.dataset[field] = copy.deepcopy(self.questions[field])

        print("Loading and preparing results...     ")
        time_t = datetime.datetime.now()
        with open(resFile) as f:
            anns = json.load(f)
        assert isinstance(anns, list), "results is not an array of objects"
        annsQuesIds = [ann["question_id"] for ann in anns]
        assert set(annsQuesIds) == set(
            self.getQuesIds()
        ), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file."
        for ann in anns:
            quesId = ann["question_id"]
            if res.dataset["task_type"] == "Multiple Choice":
                assert (
                    ann["answer"] in self.qqa[quesId]["multiple_choices"]
                ), "predicted answer is not one of the multiple choices"
            # Carry ground-truth metadata over so the result can be indexed
            # and filtered like the annotations.
            qaAnn = self.qa[quesId]
            ann["image_id"] = qaAnn["image_id"]
            ann["question_type"] = qaAnn["question_type"]
            ann["answer_type"] = qaAnn["answer_type"]
        print(
            "DONE (t=%0.2fs)" % ((datetime.datetime.now() - time_t).total_seconds())
        )

        res.dataset["annotations"] = anns
        res.createIndex()
        return res
class VQAEval:
    """Compute VQA accuracy of a result set against ground-truth annotations.

    ``vqa`` and ``vqaRes`` are expected to expose a ``qa`` mapping from
    question id to annotation; ``vqa`` must also provide ``getQuesIds()``.
    After ``evaluate`` the scores are available in ``accuracy``, ``evalQA``,
    ``evalQuesType`` and ``evalAnsType`` (percentages rounded to ``n``
    decimals).
    """

    def __init__(self, vqa=None, vqaRes=None, n=2):
        """
        :param vqa: ground-truth object (may be None when only the text
                    normalisation helpers are needed)
        :param vqaRes: result object
        :param n (int): number of decimals kept in reported percentages
        """
        self.n = n
        self.accuracy = {}
        self.evalQA = {}
        self.evalQuesType = {}
        self.evalAnsType = {}
        self.vqa = vqa
        self.vqaRes = vqaRes
        if vqa is not None:
            self.params = {"question_id": vqa.getQuesIds()}
        # NOTE(review): "somebody'd": "somebodyd" looks reversed compared with
        # its neighbours, and "let's"/"she's" map to themselves. Left as-is
        # because editing the table changes scores -- confirm against the
        # reference VQA evaluation script.
        self.contractions = {
            "aint": "ain't",
            "arent": "aren't",
            "cant": "can't",
            "couldve": "could've",
            "couldnt": "couldn't",
            "couldn'tve": "couldn't've",
            "couldnt've": "couldn't've",
            "didnt": "didn't",
            "doesnt": "doesn't",
            "dont": "don't",
            "hadnt": "hadn't",
            "hadnt've": "hadn't've",
            "hadn'tve": "hadn't've",
            "hasnt": "hasn't",
            "havent": "haven't",
            "hed": "he'd",
            "hed've": "he'd've",
            "he'dve": "he'd've",
            "hes": "he's",
            "howd": "how'd",
            "howll": "how'll",
            "hows": "how's",
            "Id've": "I'd've",
            "I'dve": "I'd've",
            "Im": "I'm",
            "Ive": "I've",
            "isnt": "isn't",
            "itd": "it'd",
            "itd've": "it'd've",
            "it'dve": "it'd've",
            "itll": "it'll",
            "let's": "let's",
            "maam": "ma'am",
            "mightnt": "mightn't",
            "mightnt've": "mightn't've",
            "mightn'tve": "mightn't've",
            "mightve": "might've",
            "mustnt": "mustn't",
            "mustve": "must've",
            "neednt": "needn't",
            "notve": "not've",
            "oclock": "o'clock",
            "oughtnt": "oughtn't",
            "ow's'at": "'ow's'at",
            "'ows'at": "'ow's'at",
            "'ow'sat": "'ow's'at",
            "shant": "shan't",
            "shed've": "she'd've",
            "she'dve": "she'd've",
            "she's": "she's",
            "shouldve": "should've",
            "shouldnt": "shouldn't",
            "shouldnt've": "shouldn't've",
            "shouldn'tve": "shouldn't've",
            "somebody'd": "somebodyd",
            "somebodyd've": "somebody'd've",
            "somebody'dve": "somebody'd've",
            "somebodyll": "somebody'll",
            "somebodys": "somebody's",
            "someoned": "someone'd",
            "someoned've": "someone'd've",
            "someone'dve": "someone'd've",
            "someonell": "someone'll",
            "someones": "someone's",
            "somethingd": "something'd",
            "somethingd've": "something'd've",
            "something'dve": "something'd've",
            "somethingll": "something'll",
            "thats": "that's",
            "thered": "there'd",
            "thered've": "there'd've",
            "there'dve": "there'd've",
            "therere": "there're",
            "theres": "there's",
            "theyd": "they'd",
            "theyd've": "they'd've",
            "they'dve": "they'd've",
            "theyll": "they'll",
            "theyre": "they're",
            "theyve": "they've",
            "twas": "'twas",
            "wasnt": "wasn't",
            "wed've": "we'd've",
            "we'dve": "we'd've",
            "weve": "we've",
            "werent": "weren't",
            "whatll": "what'll",
            "whatre": "what're",
            "whats": "what's",
            "whatve": "what've",
            "whens": "when's",
            "whered": "where'd",
            "wheres": "where's",
            "whereve": "where've",
            "whod": "who'd",
            "whod've": "who'd've",
            "who'dve": "who'd've",
            "wholl": "who'll",
            "whos": "who's",
            "whove": "who've",
            "whyll": "why'll",
            "whyre": "why're",
            "whys": "why's",
            "wont": "won't",
            "wouldve": "would've",
            "wouldnt": "wouldn't",
            "wouldnt've": "wouldn't've",
            "wouldn'tve": "wouldn't've",
            "yall": "y'all",
            "yall'll": "y'all'll",
            "y'allll": "y'all'll",
            "yall'd've": "y'all'd've",
            "y'alld've": "y'all'd've",
            "y'all'dve": "y'all'd've",
            "youd": "you'd",
            "youd've": "you'd've",
            "you'dve": "you'd've",
            "youll": "you'll",
            "youre": "you're",
            "youve": "you've",
        }
        self.manualMap = {
            "none": "0",
            "zero": "0",
            "one": "1",
            "two": "2",
            "three": "3",
            "four": "4",
            "five": "5",
            "six": "6",
            "seven": "7",
            "eight": "8",
            "nine": "9",
            "ten": "10",
        }
        self.articles = ["a", "an", "the"]

        # Raw strings: same patterns as before, without invalid-escape
        # warnings on recent Python versions.
        # NOTE(review): "(?!<=\d)" reads like a mistyped look-behind
        # "(?<!\d)". As written it only requires that the period is not
        # followed by a digit. Left unchanged because altering it changes
        # reported accuracies -- confirm against the reference script.
        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(,)(\d)")
        self.punct = [
            ";",
            r"/",
            "[",
            "]",
            '"',
            "{",
            "}",
            "(",
            ")",
            "=",
            "+",
            "\\",
            "_",
            "-",
            ">",
            "<",
            "@",
            "`",
            ",",
            "?",
            "!",
        ]

    def evaluate(self, quesIds=None):
        """Score the results for ``quesIds`` (default: all ground-truth ids).

        Each predicted answer is normalised, then compared with the
        ground-truth answers; per-question accuracy is the average over
        leave-one-out subsets of ``min(1, matches / 3)``.

        Ground-truth answers are normalised IN PLACE when a question has
        more than one distinct answer.
        """
        if quesIds is None:
            quesIds = [quesId for quesId in self.params["question_id"]]
        gts = {}
        res = {}
        for quesId in quesIds:
            gts[quesId] = self.vqa.qa[quesId]
            res[quesId] = self.vqaRes.qa[quesId]

        # =================================================
        # Compute accuracy
        # =================================================
        accQA = []
        accQuesType = {}
        accAnsType = {}
        print("computing accuracy")
        for step, quesId in enumerate(quesIds):
            resAns = res[quesId]["answer"]
            resAns = resAns.replace("\n", " ").replace("\t", " ").strip()
            resAns = self.processPunctuation(resAns)
            resAns = self.processDigitArticle(resAns)

            gtAnswers = gts[quesId]["answers"]
            if len({ans["answer"] for ans in gtAnswers}) > 1:
                for ansDic in gtAnswers:
                    ansDic["answer"] = self.processPunctuation(ansDic["answer"])

            gtAcc = []
            for gtAnsDatum in gtAnswers:
                # Leave one annotator out and count the others that agree.
                otherGTAns = [item for item in gtAnswers if item != gtAnsDatum]
                matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
                gtAcc.append(min(1, float(len(matchingAns)) / 3))

            quesType = gts[quesId]["question_type"]
            ansType = gts[quesId]["answer_type"]
            avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
            accQA.append(avgGTAcc)
            accQuesType.setdefault(quesType, []).append(avgGTAcc)
            accAnsType.setdefault(ansType, []).append(avgGTAcc)
            self.setEvalQA(quesId, avgGTAcc)
            self.setEvalQuesType(quesId, quesType, avgGTAcc)
            self.setEvalAnsType(quesId, ansType, avgGTAcc)
            if step % 100 == 0:
                self.updateProgress(step / float(len(quesIds)))

        self.setAccuracy(accQA, accQuesType, accAnsType)
        print("Done computing accuracy")

    def processPunctuation(self, inText):
        """Strip punctuation from ``inText`` and return the result.

        A punctuation mark is deleted when it touches a space in the input
        (or when the input contains a digit,digit comma); otherwise it is
        replaced by a space. Periods not followed by a digit are removed.
        """
        # Does not depend on the punctuation mark, so test it once.
        hasDigitComma = re.search(self.commaStrip, inText) is not None
        outText = inText
        for p in self.punct:
            if (p + " " in inText or " " + p in inText) or hasDigitComma:
                outText = outText.replace(p, "")
            else:
                outText = outText.replace(p, " ")
        # No third positional argument: it is ``count`` (not flags), and
        # passing re.UNICODE there capped the removal at 32 periods.
        outText = self.periodStrip.sub("", outText)
        return outText

    def processDigitArticle(self, inText):
        """Lower-case, map number words to digits, drop articles and restore
        apostrophes in known contractions."""
        outText = []
        for word in inText.lower().split():
            # ``get`` rather than ``setdefault``: looking a word up must not
            # insert it into the number-word table.
            word = self.manualMap.get(word, word)
            if word not in self.articles:
                outText.append(word)
        for wordId, word in enumerate(outText):
            if word in self.contractions:
                outText[wordId] = self.contractions[word]
        return " ".join(outText)

    def setAccuracy(self, accQA, accQuesType, accAnsType):
        """Store overall, per-question-type and per-answer-type percentages."""
        self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n)
        self.accuracy["perQuestionType"] = {
            quesType: round(
                100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]),
                self.n,
            )
            for quesType in accQuesType
        }
        self.accuracy["perAnswerType"] = {
            ansType: round(
                100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n
            )
            for ansType in accAnsType
        }

    def setEvalQA(self, quesId, acc):
        """Record the percentage accuracy of one question."""
        self.evalQA[quesId] = round(100 * acc, self.n)

    def setEvalQuesType(self, quesId, quesType, acc):
        """Record one question's accuracy under its question type."""
        if quesType not in self.evalQuesType:
            self.evalQuesType[quesType] = {}
        self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)

    def setEvalAnsType(self, quesId, ansType, acc):
        """Record one question's accuracy under its answer type."""
        if ansType not in self.evalAnsType:
            self.evalAnsType[ansType] = {}
        self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)

    def updateProgress(self, progress):
        """Redraw a 20-character text progress bar on stdout.

        :param progress: fraction done in [0, 1]; ints are converted,
                         other types are reported as an error.
        """
        barLength = 20
        status = ""
        if isinstance(progress, int):
            progress = float(progress)
        if not isinstance(progress, float):
            progress = 0
            status = "error: progress var must be float\r\n"
        if progress < 0:
            progress = 0
            status = "Halt...\r\n"
        if progress >= 1:
            progress = 1
            status = "Done...\r\n"
        block = int(round(barLength * progress))
        text = "\rFinshed Percent: [{0}] {1}% {2}".format(
            "#" * block + "-" * (barLength - block), int(progress * 100), status
        )
        sys.stdout.write(text)
        sys.stdout.flush()
a/lavis/configs/datasets/aokvqa/defaults_instruct.yaml b/lavis/configs/datasets/aokvqa/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d6dc37f7e37057e4cebaffef784059321efa25e --- /dev/null +++ b/lavis/configs/datasets/aokvqa/defaults_instruct.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + aok_vqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: qa + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json + storage: + - aokvqa/annotations/aokvqa_v1p0_train.json + # val: + # url: + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json + # storage: + # - aokvqa/annotations/aokvqa_v1p0_val.json + # - aokvqa/annotations/specialized_vocab_train_lavis.json + # # - aokvqa/annotations/large_vocab_train_lavis.json + # test: + # url: + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json + # storage: + # - aokvqa/annotations/aokvqa_v1p0_test.json + # - aokvqa/annotations/specialized_vocab_train_lavis.json + images: + # storage: 
/coco/images + storage: /export/share/datasets/vision/coco/images diff --git a/lavis/configs/datasets/audiocaps/defaults_mm_cap.yaml b/lavis/configs/datasets/audiocaps/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76a05788dea9ad9bf1084aaacc558b09c6c390b3 --- /dev/null +++ b/lavis/configs/datasets/audiocaps/defaults_mm_cap.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + audiocaps_mm_caption: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + + text_processor: + train: + name: "blip_instruction" + modality: audio + task: caption + eval: + name: "blip_caption" + + data_type: [audio] + + build_info: + kwargs: + missing_ids: [2sh7ZkazyO8, 966jA2-z0mQ, 52RlolYyjAE, HVAc9hm4jjk, 8lPjqvYWNyM, eXgPnnE3TuQ] + annotations: + train: + url: + - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv + storage: + - audiocaps/annotations/train.csv + + val: + url: + - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv + storage: + - audiocaps/annotations/val.csv + + test: + url: + - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv + storage: + - audiocaps/annotations/test.csv + + audio: + storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio \ No newline at end of file diff --git a/lavis/configs/datasets/audiocaps/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/audiocaps/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27cf6b06e97b300c9e905fc60e1062c01112d839 --- /dev/null +++ b/lavis/configs/datasets/audiocaps/defaults_mm_cap_instruct.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 
2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + audiocaps_mm_caption_instruct: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + + text_processor: + train: + name: "blip_instruction" + modality: audio + task: caption + eval: + name: "blip_caption" + + data_type: [audio] + + missing_ids: [2sh7ZkazyO8, 966jA2-z0mQ, 52RlolYyjAE, HVAc9hm4jjk, 8lPjqvYWNyM, eXgPnnE3TuQ] + + build_info: + kwargs: + cached: False + cached_dir: /export/einstein-vision/audio_datasets/audiocaps/beats_features + annotations: + train: + url: + - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv + storage: + - audiocaps/annotations/train.csv + + # val: + # url: + # - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv + # storage: + # - audiocaps/annotation/val.csv + + # test: + # url: + # - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv + # storage: + # - /export/einstein-vision/audio_datasets/audiocaps/dataset/test.csv + + audio: + storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio \ No newline at end of file diff --git a/lavis/configs/datasets/audiocaps/defaults_mm_qa.yaml b/lavis/configs/datasets/audiocaps/defaults_mm_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04b9ae39b6efb127c78782d0977185b2252ca50e --- /dev/null +++ b/lavis/configs/datasets/audiocaps/defaults_mm_qa.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + audiocaps_mm_qa: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: True + + text_processor: + train: + name: "blip_instruction" + modality: audio + task: qa + eval: + name: "blip_question" + + data_type: [audio] + + build_info: + kwargs: + cached: False + # add_binary: True + cached_dir: /export/einstein-vision/audio_datasets/audiocaps/beats_features + missing_ids: [2sh7ZkazyO8, 966jA2-z0mQ, 52RlolYyjAE, HVAc9hm4jjk, 8lPjqvYWNyM, eXgPnnE3TuQ] + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/audiocaps/audio_qa_final_train.csv + # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_train.csv + storage: + - audiocaps_qa/annotations/train.csv + # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_train.csv + + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/audiocaps/audio_qa_final_val.csv + # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_val.csv + # storage: + # # - audiocaps_qa/annotations/val.csv + # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_val.csv + + audio: + storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio \ No newline at end of file diff --git a/lavis/configs/datasets/audioset/defaults_mm_cap.yaml b/lavis/configs/datasets/audioset/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c00f9aa8fa89c97b2ad79a1e27e5befb7577ae8f --- /dev/null +++ b/lavis/configs/datasets/audioset/defaults_mm_cap.yaml @@ -0,0 +1,47 @@ + # Copyright (c) 2023, salesforce.com, 
inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + audioset_mm_caption: # 14141 + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_instruction + modality: audio + task: classification + eval: + name: blip_caption + + data_type: [audio] + + build_info: + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data//audioset/balanced_train_clean.csv + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv + - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv + storage: + - audioset/balanced_train_clean.csv + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv + - audioset/annotations/class_labels_indices.csv + + # val: + # url: + # - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv + # - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv + # storage: + # - audioset/annotations/eval_segments.csv + # - audioset/annotations/class_labels_indices.csv + audio: + storage: /export/einstein-vision/audio_datasets/AudioSet/all_audio \ No newline at end of file diff --git a/lavis/configs/datasets/audioset/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/audioset/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b0c7746fb659f812c6114e5fbfdbc4b2dc2e3c2 --- /dev/null +++ b/lavis/configs/datasets/audioset/defaults_mm_cap_instruct.yaml @@ -0,0 +1,48 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + audioset_mm_caption_instruct: # 14141 + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_instruction + modality: audio + task: classification + eval: + name: blip_caption + + data_type: [audio] + + build_info: + annotations: + train: + url: + # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data//audioset/balanced_train_clean.csv + - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv + - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv + storage: + - audioset/annotations/balanced_train_clean.csv + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv + - audioset/annotations/class_labels_indices.csv + + # val: + # url: + # - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv + # - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv + # storage: + # - audioset/annotations/eval_segments.csv + # - audioset/annotations/class_labels_indices.csv + + audio: + storage: /export/einstein-vision/audio_datasets/AudioSet/all_audio \ No newline at end of file diff --git a/lavis/configs/datasets/avsd/defaults_dial.yaml b/lavis/configs/datasets/avsd/defaults_dial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..939ac9bcc1916c7ab09fe86692aaaeffc780dd22 --- /dev/null +++ b/lavis/configs/datasets/avsd/defaults_dial.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + avsd_dialogue: # name of the dataset builder + dataset_card: dataset_card/avsd_dialogue.md + data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json + storage: avsd/annotations/train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json + storage: avsd/annotations/val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json + storage: avsd/annotations/test.json + features: + storage: avsd/features/ diff --git a/lavis/configs/datasets/avsd/defaults_mm_dial_instruct.yaml b/lavis/configs/datasets/avsd/defaults_mm_dial_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a151652d7cbe2539dea0f2625c372f5b5311ee3 --- /dev/null +++ b/lavis/configs/datasets/avsd/defaults_mm_dial_instruct.yaml @@ -0,0 +1,65 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + avsd_mm_dialogue_instruct: # name of the dataset builder + data_type: [video, audio] + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json + storage: + - avsd/annotations/train.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json + storage: + - avsd/annotations/val.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json + storage: + - avsd/annotations/test.json + templates: null + + audio: + storage: /export/video-language-dataset/data/charade/videos + + video: + storage: /export/video-language-dataset/data/charade/videos + diff --git a/lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml b/lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc9327b41ff273e699869e6d2a6959168b82d819 --- /dev/null +++ b/lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml @@ -0,0 +1,14 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + blip_diffusion_finetune: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + images: + storage: "" diff --git a/lavis/configs/datasets/capfilt14m/defaults_cap.yaml b/lavis/configs/datasets/capfilt14m/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d2821c7b34ada42edf5fb60324c9aaeab7e4848 --- /dev/null +++ b/lavis/configs/datasets/capfilt14m/defaults_cap.yaml @@ -0,0 +1,30 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + capfilt14m: # 13873136 + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + text_processor: + train: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/capfilt14m/annotation.json + # - /export/share/datasets/vision_language/capfilt_14m_new/annotation.json + storage: + - capfilt14m/annotations/annotation.json + # - /export/share/datasets/vision_language/capfilt_14m_new/annotation.json + images: + storage: /export/share/datasets/vision/coco/images \ No newline at end of file diff --git a/lavis/configs/datasets/capfilt14m/defaults_cap_instruct.yaml b/lavis/configs/datasets/capfilt14m/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec4e59e8f3da7ad2984f4af61b9483049afaef03 --- /dev/null 
+++ b/lavis/configs/datasets/capfilt14m/defaults_cap_instruct.yaml @@ -0,0 +1,34 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + capfilt14m_instruct: # 13873136 + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/capfilt14m/annotation.json + # - /export/share/datasets/vision_language/capfilt_14m_new/annotation.json + storage: + - capfilt14m/annotations/annotation.json + # - /export/share/datasets/vision_language/capfilt_14m_new/annotation.json + + images: + storage: /export/share/datasets/vision/coco/images \ No newline at end of file diff --git a/lavis/configs/datasets/charade/defaults_cap.yaml b/lavis/configs/datasets/charade/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c28a1ada295f570e2969f5353d3ee9422a25efbb --- /dev/null +++ b/lavis/configs/datasets/charade/defaults_cap.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + charade_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/charade/train_lavis.json + # - /export/video-language-dataset/data/charade/train_lavis.json + storage: + - charade/annotations/train.json + # - /export/video-language-dataset/data/charade/train_lavis.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/charade/val_lavis.json + # - /export/video-language-dataset/data/charade/val_lavis.json + storage: + - charade/annotations/val.json + # - /export/video-language-dataset/data/charade/val_lavis.json + videos: + storage: /export/video-language-dataset/data/charade/videos diff --git a/lavis/configs/datasets/charade/defaults_cap_instruct.yaml b/lavis/configs/datasets/charade/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1a7ba82c9af6e2e4694a21a41e5d760e5a97573 --- /dev/null +++ b/lavis/configs/datasets/charade/defaults_cap_instruct.yaml @@ -0,0 +1,54 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + charade_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/charade/train_lavis.json + # - /export/video-language-dataset/data/charade/train_lavis.json + storage: + - charade/annotations/train.json + # - /export/video-language-dataset/data/charade/train_lavis.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/charade/val_lavis.json + # - /export/video-language-dataset/data/charade/val_lavis.json + storage: + - charade/annotations/val.json + # - /export/video-language-dataset/data/charade/val_lavis.json + videos: + storage: /export/video-language-dataset/data/charade/videos diff --git a/lavis/configs/datasets/clotho/defaults_mm_cap.yaml b/lavis/configs/datasets/clotho/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e04f2b814f297c55ec65f297a4a3f5c9157ae840 --- /dev/null +++ b/lavis/configs/datasets/clotho/defaults_mm_cap.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + clothov2: # name of the dataset builder + audio_processor: + train: + name: beats_audio + eval: + name: beats_audio + + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + + data_type: [audio] + + build_info: + kwargs: + clotho_root: /export/einstein-vision/audio_datasets/clothov2/ + split: eval + + annotations: + train: + url: + - https://zenodo.org/record/4783391/files/clotho_captions_development.csv + storage: + - clothov2/annotations/clotho_captions_development.csv + val: + url: + - https://zenodo.org/record/4783391/files/clotho_captions_evaluation.csv + storage: + - clothov2/annotations/clotho_captions_evaluation.csv + audio: + storage: /export/einstein-vision/audio_datasets/clothov2/CLOTHO_v2.1/clotho_audio_files/ + \ No newline at end of file diff --git a/lavis/configs/datasets/clotho/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/clotho/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a4b755b21422604937de2034a1fe8209f302912 --- /dev/null +++ b/lavis/configs/datasets/clotho/defaults_mm_cap_instruct.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + clothov2_instruct: # name of the dataset builder + audio_processor: + train: + name: beats_audio + eval: + name: beats_audio + + text_processor: + train: + name: "blip_instruction" + modality: audio + task: caption + eval: + name: "blip_caption" + + data_type: [audio] + + build_info: + kwargs: + clotho_root: /export/einstein-vision/audio_datasets/clothov2/ + split: eval + + annotations: + train: + url: + - https://zenodo.org/record/4783391/files/clotho_captions_development.csv + storage: + - clothov2/annotations/clotho_captions_development.csv + val: + url: + - https://zenodo.org/record/4783391/files/clotho_captions_evaluation.csv + storage: + - clothov2/annotations/clotho_captions_evaluation.csv + audio: + storage: /export/einstein-vision/audio_datasets/clothov2/CLOTHO_v2.1/clotho_audio_files/ + \ No newline at end of file diff --git a/lavis/configs/datasets/clotho/defaults_mm_qa.yaml b/lavis/configs/datasets/clotho/defaults_mm_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7d97651b18034cb3ed3259ac669267df07a8800 --- /dev/null +++ b/lavis/configs/datasets/clotho/defaults_mm_qa.yaml @@ -0,0 +1,44 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + clotho_qa: # name of the dataset builder + audio_processor: + train: + name: beats_audio + eval: + name: beats_audio + + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + + data_type: [audio] + + build_info: + + annotations: + train: + url: + - https://zenodo.org/records/6473207/files/clotho_aqa_train.csv + storage: + - clotho_Qa/annotations/clotho_aqa_train.csv + val: + url: + - https://zenodo.org/records/6473207/files/clotho_aqa_val.csv + storage: + - clotho_qa/annotations/clotho_aqa_val.csv + + test: + url: + - https://zenodo.org/records/6473207/files/clotho_aqa_test.csv + storage: + - clotho_qa/annotations/clotho_aqa_test.csv + audio: + storage: /export/einstein-vision/audio_datasets/clotho-aqa/audio_files + \ No newline at end of file diff --git a/lavis/configs/datasets/coco/defaults_cap.yaml b/lavis/configs/datasets/coco/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f9ffc8d293fed0bac7d745ee8c00f53ce39565d --- /dev/null +++ b/lavis/configs/datasets/coco/defaults_cap.yaml @@ -0,0 +1,28 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_caption: # name of the dataset builder + dataset_card: dataset_card/coco_caption.md + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json + md5: aa31ac474cf6250ebb81d18348a07ed8 + storage: coco/annotations/coco_karpathy_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + md5: b273847456ef5580e33713b1f7de52a0 + storage: coco/annotations/coco_karpathy_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 + storage: coco/annotations/coco_karpathy_test.json + images: + storage: coco/images/ diff --git a/lavis/configs/datasets/coco/defaults_cap_instruct.yaml b/lavis/configs/datasets/coco/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bce77990676e85abed2f50e122909f953ffb14b7 --- /dev/null +++ b/lavis/configs/datasets/coco/defaults_cap_instruct.yaml @@ -0,0 +1,44 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_caption_instruct: # name of the dataset builder + dataset_card: dataset_card/coco_caption.md + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json + md5: aa31ac474cf6250ebb81d18348a07ed8 + storage: coco/annotations/coco_karpathy_train.json + # val: + # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + # md5: b273847456ef5580e33713b1f7de52a0 + # storage: coco/annotations/coco_karpathy_val.json + # test: + # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + # md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 + # storage: coco/annotations/coco_karpathy_test.json + images: + storage: /export/share/datasets/vision/coco/images diff --git a/lavis/configs/datasets/coco/defaults_ret.yaml b/lavis/configs/datasets/coco/defaults_ret.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bcc8a07b23bd77e0457ff5055b5037df7c9112f --- /dev/null +++ b/lavis/configs/datasets/coco/defaults_ret.yaml @@ -0,0 +1,27 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_retrieval: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json + md5: aa31ac474cf6250ebb81d18348a07ed8 + storage: coco/annotations/coco_karpathy_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + md5: b273847456ef5580e33713b1f7de52a0 + storage: coco/annotations/coco_karpathy_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 + storage: coco/annotations/coco_karpathy_test.json + images: + storage: coco/images/ diff --git a/lavis/configs/datasets/coco/defaults_vqa.yaml b/lavis/configs/datasets/coco/defaults_vqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08e036d2fd55408afd6c9a799ce8b8f7c97abd90 --- /dev/null +++ b/lavis/configs/datasets/coco/defaults_vqa.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json + storage: + - coco/annotations/vqa_train.json + - coco/annotations/vqa_val.json + val: + url: + # TODO make this order insensitive + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json + storage: + - coco/annotations/vqa_val_eval.json + - coco/annotations/answer_list.json + - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json + - coco/annotations/v2_mscoco_val2014_annotations.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json + storage: + - coco/annotations/vqa_test.json + - coco/annotations/answer_list.json + images: + storage: coco/images/ diff --git a/lavis/configs/datasets/coco/defaults_vqa_instruct.yaml b/lavis/configs/datasets/coco/defaults_vqa_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a85202b3ab408190b0f9d556836257baa81a49a --- /dev/null +++ 
b/lavis/configs/datasets/coco/defaults_vqa_instruct.yaml @@ -0,0 +1,57 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_vqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: qa + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json + storage: + - coco/annotations/vqa_train.json + - coco/annotations/vqa_val.json + # val: + # url: + # # TODO make this order insensitive + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json + # storage: + # - coco/annotations/vqa_val_eval.json + # - coco/annotations/answer_list.json + # - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json + # - coco/annotations/v2_mscoco_val2014_annotations.json + # test: + # url: + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json + # - 
https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json + # storage: + # - coco/annotations/vqa_test.json + # - coco/annotations/answer_list.json + images: + storage: /export/share/datasets/vision/coco/images diff --git a/lavis/configs/datasets/coco/eval_vqa.yaml b/lavis/configs/datasets/coco/eval_vqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfc637955aa6d7972150d671368fa6aa7d235cfd --- /dev/null +++ b/lavis/configs/datasets/coco/eval_vqa.yaml @@ -0,0 +1,27 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + # TODO make this order insensitive + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json + storage: + - coco/annotations/vqa_val_eval.json + - coco/annotations/answer_list.json + - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json + - coco/annotations/v2_mscoco_val2014_annotations.json + images: + storage: coco/images/ diff --git a/lavis/configs/datasets/coin/defaults_cap.yaml b/lavis/configs/datasets/coin/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c783102dacac9eda77a3c84e77d0d240840c454d --- /dev/null +++ 
b/lavis/configs/datasets/coin/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coin_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/coin/train.json + # - /export/video-language-dataset/data/coin/annotations/train_lavis.json + storage: + - coin/annotations/train.json + # - /export/video-language-dataset/data/coin/annotations/train_lavis.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/coin/val.json + # - /export/video-language-dataset/data/coin/annotations/val_lavis.json + storage: + - coin/annotations/val.json + # - /export/video-language-dataset/data/coin/annotations/val_lavis.json + videos: + storage: /export/video-language-dataset/data/coin/annotations/videos/ diff --git a/lavis/configs/datasets/coin/defaults_cap_instruct.yaml b/lavis/configs/datasets/coin/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1f8c8f468571112cdb0b63548b047584a292d15 --- /dev/null +++ b/lavis/configs/datasets/coin/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coin_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/coin/train.json + # - /export/video-language-dataset/data/coin/annotations/train_lavis.json + storage: + - coin/annotations/train.json + # - /export/video-language-dataset/data/coin/annotations/train_lavis.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/coin/val.json + # - /export/video-language-dataset/data/coin/annotations/val_lavis.json + storage: + - coin/annotations/val.json + # - /export/video-language-dataset/data/coin/annotations/val_lavis.json + videos: + storage: /export/video-language-dataset/data/coin/annotations/videos/ diff --git a/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml b/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f62cd3a2e5f69cc821ddb683c5eb642700d2274 --- /dev/null +++ b/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml @@ -0,0 +1,20 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + conceptual_caption_12m: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/home/workspace/datasets/cc12m.json + storage: + - conceptual_caption/annotations/cc12m.json + images: + storage: conceptual_caption/images_12m diff --git a/lavis/configs/datasets/conceptual_caption/defaults_12m_instruct.yaml b/lavis/configs/datasets/conceptual_caption/defaults_12m_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c576bbcc7c066316ba98c3018f450b4bc666789 --- /dev/null +++ b/lavis/configs/datasets/conceptual_caption/defaults_12m_instruct.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + conceptual_caption_12m_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/cc12m/x_instructblip_clean.json + # - /export/home/workspace/datasets/cc12m.json + storage: + - conceptual_caption/annotations/cc12m.json + images: + storage: conceptual_caption/images_12m diff --git a/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml 
b/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcba29b0ec781b3424ef06ffe59b474c82cd14f3 --- /dev/null +++ b/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml @@ -0,0 +1,20 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + conceptual_caption_3m: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/home/workspace/datasets/cc3m.json + storage: + - conceptual_caption/annotations/cc3m.json + images: + storage: conceptual_caption/images diff --git a/lavis/configs/datasets/conceptual_caption/defaults_3m_instruct.yaml b/lavis/configs/datasets/conceptual_caption/defaults_3m_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05f2b523afde062c7075ac78ac090c76aa13c53b --- /dev/null +++ b/lavis/configs/datasets/conceptual_caption/defaults_3m_instruct.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + conceptual_caption_3m_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/home/workspace/datasets/cc3m.json + storage: + - conceptual_caption/annotations/cc3m.json + images: + storage: conceptual_caption/images diff --git a/lavis/configs/datasets/didemo/defaults_ret.yaml b/lavis/configs/datasets/didemo/defaults_ret.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7923d03ce84fea806b7605ff425e0b362506fe62 --- /dev/null +++ b/lavis/configs/datasets/didemo/defaults_ret.yaml @@ -0,0 +1,25 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + didemo_retrieval: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json + storage: didemo/annotations/retrieval_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json + storage: didemo/annotations/retrieval_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json + storage: didemo/annotations/retrieval_test.json + videos: + storage: didemo/videos + # storage: /export/share/dongxuli/data/didemo_retrieval/videos diff --git a/lavis/configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml b/lavis/configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b11dbd0982ac9fc3ff3350cc978905dbf4193d2 --- /dev/null +++ b/lavis/configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml @@ -0,0 +1,63 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + audio_video_discrn: + # data_dir: ${env.data_dir}/datasets + audio_processor: + train: + name: beats_audio + n_frames: 2 + eval: + name: beats_audio + n_frames: 2 + + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + video_processor: + train: + name: alpro_video_train + n_frms: 2 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 2 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + data_type: [video, audio] # [images|videos|features] + + build_info: + kwargs: + total: all + shuffle_modalities: False + balance_labels: True + dataset_name: audiocaps + ground_truth: False + raw: False + + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/audiocaps_discrn.json + storage: + - discrn/annotations/audiocaps.json + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/audiocaps_discrn.json + + audio: + storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio/val + video: + storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val \ No newline at end of file diff --git a/lavis/configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml b/lavis/configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2389aa75447520439f5fa498d2019ecaa7c4f22f --- /dev/null +++ b/lavis/configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml @@ -0,0 +1,48 @@ + # 
Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + image_pc_discrn: # name of the dataset builder + vis_processor: + train: + name: "clip_image_train" + eval: + name: "clip_image_eval" + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + data_type: [images, pc] # [images|videos|features] + + + build_info: + + kwargs: + total: all + shuffle_modalities: False + balance_labels: True + dataset_name: objaverse + + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/objaverse_discrn.json + storage: + - discrn/annotations/objaverse.json + # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/objaverse_discrn.json + pc: + storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel + + images: + storage: /export/einstein-vision/3d_vision/objaverse_captions/images/ \ No newline at end of file diff --git a/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/audiocaps_discrn.json b/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/audiocaps_discrn.json new file mode 100644 index 0000000000000000000000000000000000000000..9af592542f56bdaa0285272cb0e9587165f92043 --- /dev/null +++ b/lavis/configs/datasets/discriminatory_reasoning/discriminatory_dataset/audiocaps_discrn.json @@ -0,0 +1 @@ +[{"captions": ["a person is burping then speaks and laughs", "a toilet flushes and a female speaks"], "sample_ids": ["wAAkbZToh8", "yaln9y8I7ms"], "start_seconds": 
["0", "230"], "properties": ["burp, laugh, speak", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man burps and a woman speaks", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 0}, {"captions": ["a dog barks and whimpers", "a stream of water runs briefly"], "sample_ids": ["sShpyu2l4YQ", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["barks, whimpers, dog", "stream, water, run"], "captions_pred_video": ["the puppies are playing with a toy", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a dog is barking and growling", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a man speaks and is typing on a keyboard"], "sample_ids": ["xjhAnI2q6hM", "x9JovgqUcs"], "start_seconds": ["6", "500"], "properties": ["engine revs, vehicle, people", "a, man, speaks, keyboard"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man speaks and types on a keyboard"], "question": "which entity is typing on a keyboard?", "label": 1}, {"captions": ["a man woman speak while crickets sing", "people cheer as a vehicle engine revs"], "sample_ids": ["zTLVJCo4WEE", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["a, crickets, sing", "engine revs, vehicle, people"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a young female speaks, followed by spraying 
and a female screaming"], "sample_ids": ["sfAvvZwdLCY", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["water drains, flushes, water", "female, spraying, scream"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["dogs barking and whimpering", "winds blows roughly as a vehicle races past"], "sample_ids": ["tIY7qOV3rEM", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["barking, whimpering, dog", "wind, blows, vehicle"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a jet engine roars and wind blows "], "question": "which entity is more calm", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["yNtRmrn0io8", "sLUnaPT5gM8"], "start_seconds": ["210", "0"], "properties": ["storm, distance, strike", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a house in the middle of the night", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["rain falls and thunder roars", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a woman speaks happily and an animal chirps"], "sample_ids": ["uEU-Hg5MTN8", "uWAAAL4CIoc"], "start_seconds": ["27", "0"], "properties": ["a woman, laughs, animal", "a woman, chirps, animal"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman 
is speaking and a dog is barking "], "question": "which entity has a woman speaking and an animal chirps?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a child speaks in closed space"], "sample_ids": ["sncRqQ67iJU", "yW6FWLSLkx4"], "start_seconds": ["460", "40"], "properties": ["loud, repeatedly, man", "child, space, speak"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is snoring", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a stream of water runs briefly"], "sample_ids": ["wDVMhEdTiVw", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["gun, shoot, water", "stream, water, run"], "captions_pred_video": ["a blurry image of trees and water in the forest", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zofjfKhqLk8", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["background, metal, clank", "People, motor, brakes"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a machine?", "label": 0}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "loud intermittent buzzing with intermittent laughter"], 
"sample_ids": ["voJh2gJxXhA", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["music, frog, croak", "loud, laughter, intermittent"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a small engine idles continuously", "an airplane engine spools and people speak"], "sample_ids": ["y5WII6cTH7k", "wTjoRj1se3U"], "start_seconds": ["40", "390"], "properties": ["engine, idle, continuously", "airplane, engine, spool"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a motorcycle engine is idling", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vZAqdHZ81yA", "uEU-Hg5MTN8"], "start_seconds": ["180", "27"], "properties": ["engine, motorcycle, idling", "a woman, laughs, animal"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is idling loudly", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 0}, {"captions": ["male speech with light ticking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xO-Q2BlIIPU", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["male, speech, ticking", "a woman, something, fried"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 
31 2016", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a stream of water runs briefly"], "sample_ids": ["uZesmtKZGSw", "x-PeY8Yb8M4"], "start_seconds": ["250", "300"], "properties": ["men, talk, cars", "stream, water, run"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["birds chirp as a bell rings", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["ziUT9IFTkjg", "zl9Dqx-j7q4"], "start_seconds": ["10", "6"], "properties": ["chirp, bell, ring", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["yajyRTUQk3U", "vlJS7LN2XyM"], "start_seconds": ["400", 
"30"], "properties": ["noise, woman, speak", "background, clocks, ticking"], "captions_pred_video": ["- a woman cooking in the kitchen", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a ticktock of a clock"], "question": "which entity has a quieter background", "label": 1}, {"captions": ["a train horn sounds and railroad crossing ring", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["s7knHCFW82w", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["horn, sound, train", "airplane, boy, fly"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["u--KhUW8l1Y", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["horn, siren, life", "a woman, laughs, animal"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "multiple people speak and children yell while water gurgles"], "sample_ids": ["w0xsN8X18Y", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["rain, thunder, surface", "multiple, people, yell"], 
"captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "water flows as men speak and yell"], "sample_ids": ["xKB8O8LTs6s", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["music, gunshots, explosion", "water, flow, men"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wyllXV6PjKo", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["a kid, talk, cry", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman speaks and a baby cries", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a machine beeps continuously"], "sample_ids": ["u2f5NpsoHBg", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["person, laugh, clap", "beeps, machine, continuously"], "captions_pred_video": ["is being projected on a screen at the front of the stage", null], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uYT5gxnyMWM", 
"vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["a, scream, girl", "three men, wind, flow"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a girl speaking followed by a scream and more girls talking?", "label": 0}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vfYTJq7nU", "vfYTJq7nU"], "start_seconds": ["130", "130"], "properties": ["rustling, ducks, quack", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a duck quacks and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is a video of ducks?", "label": 0}, {"captions": ["a motorcycle engine works nearby", "a horn rings out as a machine runs by"], "sample_ids": ["tOSWIURC-4", "slZLHwNbbt4"], "start_seconds": ["0", "300"], "properties": ["engine, work, nearby", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a lawn mower is running ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "water is sprayed across a hard surface"], "sample_ids": ["soTOh3zYJfY", "sQwlkXjQabo"], "start_seconds": ["40", "10"], "properties": ["vehicle, skid, tires", "water, spray, surface"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "a close-up of a red car with water droplets on the hood"], 
"captions_pred_audio": ["a race car accelerates and revs its engine ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["un9VQlzgZM", "yajyRTUQk3U"], "start_seconds": ["5", "400"], "properties": ["females, talk, laugh", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["y8dSeubCNI", "uEU-Hg5MTN8"], "start_seconds": ["4", "27"], "properties": ["men, women, car", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking and a baby is crying"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["u--KhUW8l1Y", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["engine, sound, horn", "female, spraying, scream"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a woman talks while something is fried and objects are tapped"], 
"sample_ids": ["u6jIvCtKarQ", "yajyRTUQk3U"], "start_seconds": ["70", "400"], "properties": ["a, man, speaks", "a woman, something, fried"], "captions_pred_video": ["footage of a person using a blender on a stove top", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a person sniffles and sneezes", "a frog croaks as other frogs croak in the background"], "sample_ids": ["uRlbY6aoBU", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["sneezes, sniffles, person", "background, frog, croak"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["a man is sneezing ", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zcDwZ6W7E3E", "zFjIWfSD-4"], "start_seconds": ["180", "410"], "properties": ["a, man, speak", "People, motor, brakes"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["weDbePuc-Xc", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["cartoon character, music, vocalize", "airplane, boy, fly"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a small airplane flying 
in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a clock ticktocks"], "sample_ids": ["ukg5L09Wpvo", "v-g-j2uTByM"], "start_seconds": ["150", "30"], "properties": ["clickety-clack, train, whistle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a railroad crossing bell rings as a train horn blows"], "sample_ids": ["w0xsN8X18Y", "tZGN5a7ybxo"], "start_seconds": ["30", "60"], "properties": ["music, surface, rain", "ring, train, horn"], "captions_pred_video": [null, "is taken from a moving vehicle on the train tracks"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a train is moving and blowing its horn "], "question": "which entity is a warning", "label": 1}, {"captions": ["an insect buzzes around continuously", "water flows and trickles"], "sample_ids": ["v25l1jef3JY", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "water, flow, trickle"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "water is splashing and gurgling"], "question": "which entity is not a living thing", "label": 1}, 
{"captions": ["two men and a woman talk while wind blows and birds tweet", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wqZ135Ssz0", "y2bVZ7rz-5M"], "start_seconds": ["60", "280"], "properties": ["two men, woman, birds", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a beep occurs briefly", "a drill drills through something then people begin laughing"], "sample_ids": ["xtWeJ56-U-g", "tEE3MpBt1sg"], "start_seconds": ["20", "50"], "properties": ["beep, occur, briefly", "drill, something, laugh"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "paper is crumpling consistently"], "sample_ids": ["ugHJF0hfYkg", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a helicopter is flying overhead ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a person whistles a 
meandering tune", "birds chirp and objects are moved around"], "sample_ids": ["uFoga8sHpiw", "yPUYU6t3rwo"], "start_seconds": ["90", "370"], "properties": ["person, tune, whistle", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a bird in a cage", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a person whistles a song", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ugHJF0hfYkg", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["loud, intense, propeller", "wind, blow, vehicle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a car speeding up in the distance", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["u0TrcHhkPQ", "zl9Dqx-j7q4"], "start_seconds": ["20", "6"], "properties": ["distance, car, speed", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vqZuVbG6-HI", "ukg5L09Wpvo"], "start_seconds": ["130", "150"], "properties": ["background, male, female", "clickety-clack, train, whistle"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of a train passing through a forest on a dirt road"], 
"captions_pred_audio": ["a lawn mower is running and men are speaking ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zgUgkpk78xU", "xfaoyyzw2WU"], "start_seconds": ["70", "180"], "properties": ["horn, bells, ring", "loud, jet engine, roar"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["shmR4OZtzqA", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["man, engine, idle", "loud, laughter, intermittent"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "of a baby 
laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man speaks while a motor runs", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows and a stream of water flows nearby", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sYITalLZjj4", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["stream, flow, wind", "engine, idle, woman"], "captions_pred_video": ["two ducks are swimming in the water near each other", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["wind blows and birds chirp", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yajyRTUQk3U", "su6FAOcOA8c"], "start_seconds": ["400", "4"], "properties": ["a woman, something, fried", "engine, idle, woman"], "captions_pred_video": ["- a woman cooking in the kitchen", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking and a subway train is moving "], "question": "which woman is speaking while something is fried?", "label": 0}, {"captions": ["a female speaks softly as paper crinkles", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xvDdE3zNf8Y", "tDVADusiIoc"], "start_seconds": ["120", "60"], "properties": ["a, female, speaks", "water, radio, man"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man 
speaking over a radio?", "label": 1}, {"captions": ["an engine runs loudly", "a car speeding up in the distance"], "sample_ids": ["vqZuVbG6-HI", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["loud, engine, run", "distance, car, speed"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["an aircraft engine runs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yLCORCnd35Q", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["engine, aircraft, runs", "airplane, boy, fly"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "people applaud and hoot and chat quietly"], "sample_ids": ["vBslzh7saPw", "wwyfGO2J4"], "start_seconds": ["90", "90"], "properties": ["engine, roar, louder", "people, applaud, hoot"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "water pouring and bubbling"], "sample_ids": ["sOa7g-44Dag", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["audio, scratching, man", "water, bubbles, pouring"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "on youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "water is running from a faucet"], "question": "which entity is a video", "label": 1}, {"captions": ["a man talks as several small engines run", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["u9A6VZQCZpU", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["a, man, talk", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaking with light rustling", "a man speaks followed by another man speaking outside"], "sample_ids": ["zOZleIRqZm4", "viuTg1M-dqg"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "two men, speak, follow"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with background noise and breathing sounds "], 
"question": "which entity has more than one man speaking?", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a child speaks in closed space"], "sample_ids": ["x5cuQjOdM3E", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["cat, talk, meow", "child, space, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a man talks while metallic objects are rapped and steam is released"], "sample_ids": ["vbZ-0lGPneg", "vuUVPzd2FXw"], "start_seconds": ["30", "160"], "properties": ["a woman, a television program, a bird", "a, steam, release"], "captions_pred_video": ["of a man holding a baby duck in his hands", "of the person cooking on the grill with a spatula"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man is speaking and dishes are clanging"], "question": "which entity has a man talking?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "a door slams shut and an object moves on a hard surface"], "sample_ids": ["uC9dtII1KDI", "zkKdxzNC97Y"], "start_seconds": ["150", "27"], "properties": ["wind, gusts, distance", "hard, surface, door"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a door is opened and closed"], "question": "which entity is not a door?", "label": 0}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a toilet 
flushes and a female speaks"], "sample_ids": ["sapQIQUhFc", "yaln9y8I7ms"], "start_seconds": ["280", "230"], "properties": ["liquid, flow, distance", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a toilet flushes and a man speaks"], "question": "which entity is about a toilet?", "label": 1}, {"captions": ["a door opens and closes", "ticking continues without interruption"], "sample_ids": ["vBHyYJ8pL0", "v-g-j2uTByM"], "start_seconds": ["2", "30"], "properties": ["open, close, door", "ticking, continuous, clock"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a car accelerates and wind blows"], "sample_ids": ["uzQnlJXBbOM", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["ringing, beep, stop", "accelerates, wind, blows"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "an airplane flies overhead as a woman speaks"], "sample_ids": ["vbpKkWvfOu4", "zj2R0XoFr5k"], "start_seconds": ["560", "50"], "properties": ["a, man, speaks", "airplane, fly, overhead"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a small airplane flying in 
the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "water runs into a sink while men speak"], "sample_ids": ["wRBHTgrbiwg", "vzceMbklWc"], "start_seconds": ["50", "180"], "properties": ["bird, owl, speak", "water, sink, run"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "water is running and a man is speaking"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["electronic beeps occur in a short series", "water flows as men speak and yell"], "sample_ids": ["y682ml90jGw", "vJ7JPEFhyLA"], "start_seconds": ["11", "16"], "properties": ["beeps, series, electronic", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more natural", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vs65y4qmyBE", "sLUnaPT5gM8"], "start_seconds": ["340", "0"], "properties": ["wind, blows, strongly", "loud, laughter, intermittent"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["three men talk while wind blows and some liquid flows", "a harsh wind blows as a man speaks and another man speaks"], 
"sample_ids": ["vJ7JPEFhyLA", "y8WEcpOlT3I"], "start_seconds": ["16", "40"], "properties": ["three men, wind, flow", "harsh, wind, blows"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity has a harsher wind blowing", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "water flows and trickles"], "sample_ids": ["vimzuGQvdcU", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "water, flow, trickle"], "captions_pred_video": ["a group of people are rafting down a river", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "vehicles pass by on a roadway"], "sample_ids": ["w5W5Kqtc8E", "tgbONvsP47Y"], "start_seconds": ["100", "0"], "properties": ["wind, blow, vehicle", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "plastic is tapped on while someone speaks"], "sample_ids": ["soTOh3zYJfY", "wvKpEYswXO0"], "start_seconds": ["40", "150"], "properties": ["vehicle, skid, tires", "plastic, tap, speak"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a race car accelerates and revs its engine ", 
"a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["water splashes as an animal walks through", "continuous snoring"], "sample_ids": ["w1ir-sZ3Im8", "sLkeqCDJIyw"], "start_seconds": ["90", "120"], "properties": ["animal, water, splashes", "loud, snoring, noise"], "captions_pred_video": ["footage of a group of people riding horses through a river", ", what is the man doing on the couch? sleeping"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a bird is chirping and tweeting a bird song"], "sample_ids": ["vZAw4apG0Es", "wPz6QRAkEb4"], "start_seconds": ["30", "60"], "properties": ["background, tick, repeat", "chirps, tweets, song"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a bird in a cage on top of a pole"], "captions_pred_audio": ["a clock is ticking and people are talking", "birds are chirping in the background "], "question": "which entity is a bird", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a dog barks and whimpers"], "sample_ids": ["xKB8O8LTs6s", "sShpyu2l4YQ"], "start_seconds": ["70", "0"], "properties": ["music, gunshots, explosion", "barks, whimpers, dog"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "the puppies are playing with a toy"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a dog is barking and growling"], "question": "which entity is more calm", "label": 1}, {"captions": ["a person snoring", "an infant crying frantically"], "sample_ids": ["t8tv5YRMJUg", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["a person, snore, loud", "cry, infant, frantically"], 
"captions_pred_video": ["of a man getting his face licked by another man", "of the baby crying in the car seat"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a baby cries loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "an adult man speaks over glass clinking"], "sample_ids": ["vJvryTwuAV8", "u6jIvCtKarQ"], "start_seconds": ["16", "70"], "properties": ["audience, cheer, man", "a, man, speaks"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of a person using a blender on a stove top"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking and dishes are being moved with background noise "], "question": "which man speaks over glass clinking", "label": 1}, {"captions": ["birds chirp as a train approaches", "people cheer as a vehicle engine revs"], "sample_ids": ["xM4joTqDVp4", "xjhAnI2q6hM"], "start_seconds": ["160", "6"], "properties": ["bird, chirp, train", "engine revs, vehicle, people"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vJvryTwuAV8", "uEU-Hg5MTN8"], "start_seconds": ["16", "27"], "properties": ["audience, cheer, man", "a woman, laughs, animal"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": 
["a man is speaking and a crowd is shouting and whooping ", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zj2R0XoFr5k", "tiDFTC-5vU"], "start_seconds": ["50", "30"], "properties": ["airplane, boy, fly", "male, duck, laugh"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a drill runs and two people laugh", "a person speaks briefly"], "sample_ids": ["tEE3MpBt1sg", "zOZleIRqZm4"], "start_seconds": ["50", "80"], "properties": ["two people, laugh, drill", "person, talk, brief"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking with crickets chirping in the background"], "question": "which entity is talking", "label": 1}, {"captions": ["a person screams glaringly", "an airplane engine roars increasingly louder"], "sample_ids": ["xC8kbrKJmco", "vBslzh7saPw"], "start_seconds": ["0", "90"], "properties": ["glaringly, screams, person", "engine, roar, louder"], "captions_pred_video": [null, "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a goat is bleating ", "a jet engine roars and accelerates "], "question": "which entity is louder", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "men speak and a nozzle sprays liquid"], "sample_ids": ["v5P-ThUCINM", "wRV8yMk886E"], "start_seconds": ["400", "0"], "properties": 
["background, chirp, bird", "liquid, spray, nozzle"], "captions_pred_video": [null, "two cars are parked in a parking lot at night"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man speaks followed by a loud burst"], "question": "which entity is more likely to be used in a science class", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a man speaks as a car is passing by"], "sample_ids": ["tDVADusiIoc", "sK4u5T8hW78"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "a, car, pass"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["v5P-ThUCINM", "su6FAOcOA8c"], "start_seconds": ["400", "4"], "properties": ["background, chirp, bird", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["an infant crying frantically", "people applaud and hoot and chat quietly"], "sample_ids": ["zwOBqeFTgiU", "wwyfGO2J4"], 
"start_seconds": ["30", "90"], "properties": ["cry, infant, frantically", "people, applaud, hoot"], "captions_pred_video": ["of the baby crying in the car seat", null], "captions_pred_audio": ["a baby cries loudly", "people are clapping and speaking with background noise "], "question": "which entity is a group of people?", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "food is frying while a woman speaks"], "sample_ids": ["sEprKHm8Sj8", "yhQ2Lg-7qDY"], "start_seconds": ["90", "130"], "properties": ["car, tires, slows", "food, woman, speak"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a faucet is running and a man is speaking"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a stream of water runs briefly"], "sample_ids": ["xvDdE3zNf8Y", "x-PeY8Yb8M4"], "start_seconds": ["120", "300"], "properties": ["A, crumple, paper", "stream, water, run"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman speaks and crumples paper", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "small dogs yip and bark sharply"], "sample_ids": ["vK93VuO0yNc", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["male voice, bus, rumble", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", 
"footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "someone snores nearby"], "sample_ids": ["wyllXV6PjKo", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["a baby, a woman, a man", "someone snores, nearby, someone"], "captions_pred_video": [null, "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman speaks and a baby cries", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks as several small engines run", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["u9A6VZQCZpU", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["a, man, talk", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a jet engine roars "], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zF8yoL0rkbI", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["engine, run, someone", "a woman, a television program, a bird"], "captions_pred_video": ["footage of the traffic on the street at night", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], 
"sample_ids": ["vD6lYD1l0BY", "ukg5L09Wpvo"], "start_seconds": ["330", "150"], "properties": ["a, machine, run", "clickety-clack, train, whistle"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["several insects fly while two men talk", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["s-T9OVOiMLo", "tdWhHV3X25Q"], "start_seconds": ["330", "60"], "properties": ["several, fly, men", "applause, audience, yells"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and wind blows", "an airplane accelerates briefly"], "sample_ids": ["sxIvBMSavMQ", "zjTG0gaGCUI"], "start_seconds": ["210", "80"], "properties": ["birds, chirp, wind", "accelerates, airplane, briefly"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from 
a beehive beekeeping 101 how to extract honey from a", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a jet engine roars as wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "a man speaks on a radio as wind blows"], "sample_ids": ["uWAAAL4CIoc", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["a, dog, vocalize", "man, radio, blows"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking on a radio?", "label": 1}, {"captions": ["birds vocalize and a man speaks", "water is sprayed across a hard surface"], "sample_ids": ["v0wPrLBI3hg", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["vocalize, bird, speak", "water, spray, surface"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["people speak then an engine runs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["uMTTDZ2mb4", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["engine, run, people", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is about a vehicle engine?", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], 
"sample_ids": ["u5RmF3c3Aw", "ukg5L09Wpvo"], "start_seconds": ["60", "150"], "properties": ["engine, car, zoom", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man talks as several small engines run", "a man speaks as a motor runs in the background"], "sample_ids": ["u9A6VZQCZpU", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["bees buzz and wind blows", "a piece of wood is being placed down and sawed"], "sample_ids": ["tMJne1a4AFI", "uiItxDsDMFI"], "start_seconds": ["0", "30"], "properties": ["bees buzz, wind blows, bees", "wood, piece, saw"], "captions_pred_video": ["a swarm of bees on the ground", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a swarm of bees buzzing around", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["y8WEcpOlT3I", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["harsh, wind, blows", "two objects, woman, speak"], "captions_pred_video": ["on how to use a sewing machine youtube", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a stream of water runs briefly"], "sample_ids": ["x9JovgqUcs", "x-PeY8Yb8M4"], "start_seconds": ["500", "300"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "small dogs yip and bark sharply"], "sample_ids": ["tDVADusiIoc", "v-wcQf4BDY0"], "start_seconds": ["60", "120"], "properties": ["man, radio, blows", "bark, yip, sharply"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a dog barks and growls"], 
"question": "which entity is more active", "label": 1}, {"captions": ["an insect buzzes around continuously", "loud clanking and banging with brief male speech"], "sample_ids": ["v25l1jef3JY", "sWZzXuWYY"], "start_seconds": ["0", "420"], "properties": ["buzzes, continuously, insect", "male, speech, banging"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a sewing machine runs and a man speaks"], "question": "which entity is louder", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "small dogs yip and bark sharply"], "sample_ids": ["zdYdyF9-m8U", "v-wcQf4BDY0"], "start_seconds": ["7", "120"], "properties": ["wind, crash, shoreline", "bark, yip, sharply"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["waves crash and wind blows ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["ylpYOorfH4o", "wqZ135Ssz0"], "start_seconds": ["410", "60"], "properties": ["motor, run, steady", "man, woman, squawks"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump 
on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wqZ135Ssz0", "yajyRTUQk3U"], "start_seconds": ["60", "400"], "properties": ["man, woman, squawks", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a man speaks as a car is passing by"], "sample_ids": ["y2bVZ7rz-5M", "sK4u5T8hW78"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "a, car, pass"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "birds chirp and objects are moved around"], "sample_ids": ["uoGVs9yUqY4", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["multiple, 
vocalize, wind", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["children speak as a female ask them questions", "a telephone rings followed by a woman talking"], "sample_ids": ["wEBlkGWVWwE", "tGcFnX0GHI"], "start_seconds": ["260", "0"], "properties": ["female, speak, questions", "ring, talk, woman"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "pigeons vocalize and birds chirp"], "sample_ids": ["vzceMbklWc", "uiS58TNyUiw"], "start_seconds": ["180", "430"], "properties": ["water, faucet, sink", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["paper folding and crinkling", "a frog vocalizes as birds chirp"], "sample_ids": ["zPpG3RD8lSs", "wqUmIEzuNz4"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "frog, bird, vocalize"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of 
a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "a frog sitting in the grass on a sunny day"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a cat meows and rustles"], "question": "which entity is not a frog?", "label": 0}, {"captions": ["a dark barks and whimpers", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sYj4hpDUZDQ", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "multiple, people, yell"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "a car accelerates and wind blows"], "sample_ids": ["yDoT73BWsdA", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["engine revs, tires squeal, vehicle", "accelerates, wind, blows"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "wind blowing followed by a zoom"], "sample_ids": ["wRBHTgrbiwg", "vr8ZXjEBhMQ"], "start_seconds": ["50", "150"], "properties": ["birds, chirp, cooing", "wind, 
blow, zoom"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wSVhSdj0F0", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["horn honks, keys jingle, electronic beep", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a man speaks with another voice speaking in the background"], "sample_ids": ["wqZ135Ssz0", "u21-Z5gJCB8"], "start_seconds": ["60", "30"], "properties": ["man, woman, squawks", "background, voice, man"], "captions_pred_video": [null, "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking with another voice speaking in the background?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "water splashes and a door squeaks"], "sample_ids": ["xBxDz0CFVn0", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["wind, chatter, people", "sound, splash, door"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a dog barks and taps with background 
noise "], "question": "which entity is more quiet", "label": 0}, {"captions": ["long loud burping by a man", "a man speaks as a car is passing by"], "sample_ids": ["xmiUIOhtZyQ", "sK4u5T8hW78"], "start_seconds": ["60", "30"], "properties": ["loud, burp, man", "a, car, pass"], "captions_pred_video": ["homer simpson drinking a beer", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a car speeding up in the distance"], "sample_ids": ["wwyfGO2J4", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["people, applaud, hoot", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a propeller rotates loudly and intensely"], "sample_ids": ["zofjfKhqLk8", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["background, metal, clings", "loud, intense, propeller"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": 
["a vehicle engine revs and tires squeal", "water pouring and bubbling"], "sample_ids": ["yDoT73BWsdA", "uyRfq-jKPpo"], "start_seconds": ["10", "50"], "properties": ["engine revs, tires squeal, vehicle", "water, bubbles, pouring"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a stream of water runs briefly"], "sample_ids": ["tMbMDvT50j8", "x-PeY8Yb8M4"], "start_seconds": ["12", "300"], "properties": ["a, cry, woman", "stream, water, run"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby cries and a woman speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["s6DESzUTGjY", "tDVADusiIoc"], "start_seconds": ["16", "60"], "properties": ["wind, laugh, woman", "water, radio, 
man"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman laughing?", "label": 0}, {"captions": ["a horn honks and then loudly blares", "a duck quacks continuously"], "sample_ids": ["wnpJndXuxLc", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["horn, honk, loud", "quacks, continuously, duck"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["people speak in a closed space", "wind blows as people chatter quietly"], "sample_ids": ["sTpirNYo8vQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["people, space, speak", "wind, chatter, people"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage is blurry and out of 
focus"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking with wind noise in the background "], "question": "which entity is in a closed space", "label": 0}, {"captions": ["someone snores nearby", "a woman speaks as she rubs two objects together"], "sample_ids": ["spJCm8tD9Zo", "vzxHnu-SFEw"], "start_seconds": ["90", "80"], "properties": ["someone snores, nearby, someone", "two objects, woman, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "water rushes and then a vehicle zooms past"], "sample_ids": ["wRBHTgrbiwg", "s4Uz1Ffgo04"], "start_seconds": ["50", "100"], "properties": ["bird, owl, speak", "water, rushes, vehicle"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of an ambulance arriving at the scene of an 
accident"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is more active", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["zj2R0XoFr5k", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["airplane, boy, fly", "loud, multiple, distance"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "winds blows roughly as a vehicle races past"], "sample_ids": ["ylpYOorfH4o", "xjvTpk2Zpr8"], "start_seconds": ["410", "70"], "properties": ["engine, run, loud", "wind, blows, vehicle"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a child speaks in closed space", "a stream of water flows quickly"], 
"sample_ids": ["yW6FWLSLkx4", "wbHTKEJZyhc"], "start_seconds": ["40", "20"], "properties": ["child, space, speak", "stream, water, flow"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a waterfall is flowing and people are speaking "], "question": "which entity is moving", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wwyfGO2J4", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["people, applaud, hoot", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["people are clapping and speaking with background noise 
", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["a woman speaks with water running", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["wTideSjRFS0", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["water, running, woman", "background, birds, rustling"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "birds are chirping and a chime is ringing "], "question": "which entity has a bird in the background?", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "winds blows roughly as a vehicle races past"], "sample_ids": ["zgUgkpk78xU", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], "properties": ["horn, bell, train", "wind, blows, vehicle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a helicopter engine idles continuously", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["ugHJF0hfYkg", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["engine, idle, continuously", "applause, audience, yells"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man 
is speaking and a crowd is clapping"], "question": "which entity is not a person?", "label": 0}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["xNMovAf3o50", "ziUT9IFTkjg"], "start_seconds": ["0", "10"], "properties": ["rain, thunder, music", "background, birds, rustling"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", null], "captions_pred_audio": ["thunder and rain with music playing in the background ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a church bell rings several times", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sUVVjE3Ucp8", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["ring, bell, several", "loud, laughter, intermittent"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a church bell is ringing ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind noise makes sound into a microphone", "a male speaks and another male speaks"], "sample_ids": ["w8uLijTqtlU", "viuTg1M-dqg"], "start_seconds": ["70", "30"], "properties": ["wind, microphone, noise", "two males, speaking, male"], "captions_pred_video": ["footage is blurry and shaky", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a recording", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wTjoRj1se3U", "uYT5gxnyMWM"], "start_seconds": ["390", 
"50"], "properties": ["engine, run, people", "a, scream, girl"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a jet engine is running and people are talking", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a vehicle engine accelerating then running on idle"], "sample_ids": ["xZepNM9qcRA", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["background, motor, run", "engine, accelerate, idle"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a car speeding up in the distance"], "sample_ids": ["sZvwOuuPGP0", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["engine, diesel, truck", "distance, car, speed"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", null], "captions_pred_audio": ["a medium engine is running ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["a vehicle accelerates squealing tires", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sd7xVssqlw", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["accelerates, tires, squealing", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["children cry and people talk", "a stream of water flows as 
people talk and wind blows"], "sample_ids": ["xLwHe825Zs", "xBxDz0CFVn0"], "start_seconds": ["18", "30"], "properties": ["people talk, children cry, people talk", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing as people talk and wind blows?", "label": 1}, {"captions": ["a infant makes noise and is excited", "a man is filing a hard object"], "sample_ids": ["wIJK3-5y0kA", "vveS8HT7Uog"], "start_seconds": ["30", "100"], "properties": ["noise, excited, infant", "a man, hard, object"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "footage is of a workbench with various tools on it including a hammer and a screwdriver"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is filing and speaking with background noise and breathing "], "question": "which object is harder to file", "label": 0}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a stream of water runs briefly"], "sample_ids": ["siJFXfGWgDk", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["man, woman, vehicle", "stream, water, run"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "paper is crumpling consistently"], "sample_ids": ["ylpYOorfH4o", "v5cSxLaHADY"], "start_seconds": ["410", "0"], "properties": ["engine, run, loud", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 
youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and an engine is revving", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a consistent ticking pattern"], "sample_ids": ["wtDqrBygTcU", "sCeWURVHfOM"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "ticking, pattern, clock"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "- a close-up view of the clock's inner workings"], "captions_pred_audio": ["a man is speaking and a motor is running", "ticking of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "birds chirp and objects are moved around"], "sample_ids": ["tOj4tdLRaA", "yPUYU6t3rwo"], "start_seconds": ["70", "370"], "properties": ["woman, laugh, baby", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a baby laughs and a woman speaks", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a heavy rain falls endlessly"], "sample_ids": ["zliInBdC98Y", "wP8ZKrlx3oA"], "start_seconds": ["30", "40"], "properties": ["a, 
baby, cries, wails", "heavy, rain, fall"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a baby cries and a woman speaks", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["goats bleat and metal clings", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tH17JPjDPnc", "tdWhHV3X25Q"], "start_seconds": ["260", "60"], "properties": ["bleat, metal, clings", "applause, audience, yells"], "captions_pred_video": ["feed of the goats eating hay in the barn", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a clock ticks quietly and rhythmically"], "sample_ids": ["slZLHwNbbt4", "u7C-AEBQM"], "start_seconds": ["300", "30"], "properties": ["train, horn, sound", "ticks, rhythmic, quiet"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "a car accelerates and wind blows"], "sample_ids": ["vSeGhaZt-aI", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["water, bubbles, speak", "accelerates, wind, blows"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a infant makes 
noise and is excited", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wIJK3-5y0kA", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["noise, excited, infant", "music, gunfire, explosion"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby cries and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a child yells and another yells", "a man speaks as a car is passing by"], "sample_ids": ["vMDHu7Lxcgw", "sK4u5T8hW78"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "a, car, pass"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["an insect buzzes around continuously", "a toilet flushes and a female speaks"], "sample_ids": ["v25l1jef3JY", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["buzzes, continuously, insect", "female, flushes, toilet"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage is blurry and out of focus"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a toilet flushes and a man speaks"], "question": "which entity is not a person", 
"label": 0}, {"captions": ["an engine idles consistently before sputtering some", "some men converse over an engine running"], "sample_ids": ["rwTERCUno", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["engine, idle, sputter", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine is idling and vibrating", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaking with light rustling", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zOZleIRqZm4", "sLUnaPT5gM8"], "start_seconds": ["80", "0"], "properties": ["light, rustling, man", "loud, laughter, intermittent"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water quietly rushes by while birds chirp in the background", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["sYITalLZjj4", "wSVhSdj0F0"], "start_seconds": ["30", "10"], "properties": ["water, rushes, background, birds", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], "captions_pred_audio": ["wind blows and birds chirp", "a car horn honks and keys jangle with background noise "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "water pouring and bubbling"], "sample_ids": ["yRx9txMcBl0", "uyRfq-jKPpo"], "start_seconds": ["40", "50"], "properties": ["accelerates, tires, squeals", "water, bubbles, pouring"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at 
night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a car is revving its engine and skidding ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "some tunes played by whistling"], "sample_ids": ["uC9dtII1KDI", "u6BnG6YZqJ4"], "start_seconds": ["150", "0"], "properties": ["wind, gusts, distance", "tune, play, whistling"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a person whistling a song"], "question": "which entity is not playing a tune?", "label": 0}, {"captions": ["a dog barks and 
whimpers", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sShpyu2l4YQ", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["barks, whimpers, dog", "background, frog, croak"], "captions_pred_video": ["the puppies are playing with a toy", "a close up of a frog in the water"], "captions_pred_audio": ["a dog is barking and growling", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "some men converse over an engine running"], "sample_ids": ["s6DESzUTGjY", "sCiy7QS1U"], "start_seconds": ["16", "300"], "properties": ["wind, laugh, woman", "men, converse, engine"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a woman laughing?", "label": 0}, {"captions": ["people speak in the background as a clock ticktocks", "pigeons vocalize and birds chirp"], "sample_ids": ["vZAw4apG0Es", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["background, clock, ticktocks", "vocalize, bird, chirp"], 
"captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of the pigeon in the cage"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a car revs and accelerates loudly and men and women chatter among themselves"], "sample_ids": ["uZesmtKZGSw", "y8dSeubCNI"], "start_seconds": ["250", "4"], "properties": ["car, track, man", "men, women, car"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "an engine revving and people talking in the background"], "question": "which entity has more people", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "dishes cling together then a man begins to speak"], "sample_ids": ["vKrYfzleLB8", "sQGXqGcwOTc"], "start_seconds": ["110", "3"], "properties": ["a, ring, gunshots", "cling, speak, dishes"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a baby laughs giddily and a woman 
laughs then speaks", "paper folding and crinkling"], "sample_ids": ["wjsXBsc7M40", "zPpG3RD8lSs"], "start_seconds": ["10", "20"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "paper, fold, crinkle"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a baby laughs and a woman speaks", "the wind blows and a mouse clicks "], "question": "which entity is more likely to be a video", "label": 0}, {"captions": ["a man is filing a hard object", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vveS8HT7Uog", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "stream, water, flow"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "some 
men converse over an engine running"], "sample_ids": ["s59PfAghdkM", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["bird, chirp, background, horse, neigh", "men, converse, engine"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", null], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more likely to be found in a museum", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a woman speaks and other women and a man talk with her"], "sample_ids": ["v0x1odnXtP0", "vbpKkWvfOu4"], "start_seconds": ["210", "560"], "properties": ["keyboard, type, computer", "a, woman, man"], "captions_pred_video": ["how to make money on youtube in spanish", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking and a man is speaking"], "question": "which is a group of people", "label": 1}, {"captions": ["a child speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yW6FWLSLkx4", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["a, child, speaks", "three men, wind, flow"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "an emergency vehicle engine runs then a 
horn blows and siren sounds"], "sample_ids": ["shmR4OZtzqA", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["man, engine, idle", "engine, horn, siren"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man speaks while a motor runs", "a truck is honking its horn and a siren is blaring "], "question": "which vehicle has a horn and siren?", "label": 1}, {"captions": ["a small engine idles continuously", "dishes cling together then a man begins to speak"], "sample_ids": ["y5WII6cTH7k", "sQGXqGcwOTc"], "start_seconds": ["40", "3"], "properties": ["engine, idle, continuously", "cling, speak, dishes"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["an engine is knocking and vibrating ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "running water in a faucet with some clinks"], "sample_ids": ["yaln9y8I7ms", "zNRChLjqcU"], "start_seconds": ["230", "220"], "properties": ["female, flushes, toilet", "water, faucet, run"], "captions_pred_video": ["footage is blurry 
and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "water is running from a faucet into a sink"], "question": "which entity is a faucet?", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a stream of water runs briefly"], "sample_ids": ["sDSppXIlJrs", "x-PeY8Yb8M4"], "start_seconds": ["27", "300"], "properties": ["microphone, water, wind", "stream, water, run"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["an insect buzzes around continuously", "water is sprayed across a hard surface"], "sample_ids": ["v25l1jef3JY", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["buzzes, continuously, insect", "water, spray, surface"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vZAw4apG0Es", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["background, tick, repeat", "a woman, something, fried"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a person sniffs and sneezes"], "sample_ids": 
["yaln9y8I7ms", "uRlbY6aoBU"], "start_seconds": ["230", "0"], "properties": ["female, flushes, toilet", "sneezes, person, sniffs"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is sneezing "], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water drains", "water pouring and bubbling"], "sample_ids": ["sfAvvZwdLCY", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["water drains, flushes, water", "water, bubbles, pouring"], "captions_pred_video": ["footage of the toilet in the bathroom", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a toilet is flushed", "water is running from a faucet"], "question": "which entity is a source of water", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "birds chirp and objects are moved around"], "sample_ids": ["yZmhM1HcsyE", "yPUYU6t3rwo"], "start_seconds": ["4", "370"], "properties": ["engine, roar, water", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": 
["a motorboat speeds through water with wind noise in the background ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks with water running", "an airplane engine runs"], "sample_ids": ["wTideSjRFS0", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["water, running, woman", "engine, airplane, runs"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "the clinking of a train bell with the humming of an engine and a train horn blowing"], "sample_ids": ["xyL9F5VrjkE", "zgUgkpk78xU"], "start_seconds": ["20", "70"], "properties": ["engine, run, wind", "clinking, humming, horn"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaks as horns blow", "water is sprayed across a hard surface"], "sample_ids": ["tHyNqRyK34A", "sQwlkXjQabo"], "start_seconds": ["24", "10"], "properties": ["a, man, speaks", "water, spray, surface"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and a 
car is honking with background noise ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["yRx9txMcBl0", "y2bVZ7rz-5M"], "start_seconds": ["40", "280"], "properties": ["accelerates, tires, squeals", "motor noise, horn, siren"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning device", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "people speak as gunfire rings out"], "sample_ids": ["xzKKf9bKNUo", "wqTCwqVRDlk"], "start_seconds": ["10", "80"], "properties": ["background, noise, snoring", "gunfire, ring, speak"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wP8ZKrlx3oA", 
"vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["rain, storm, thunder", "three men, wind, flow"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a weather event", "label": 0}, {"captions": ["a propeller rotates loudly and intensely", "a man speaks over intermittent keyboard taps"], "sample_ids": ["ugHJF0hfYkg", "tw76HGONaKg"], "start_seconds": ["10", "570"], "properties": ["loud, intense, propeller", "audio, man, keyboard"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man speaks and types on a computer keyboard "], "question": "which is quieter", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a consistent ticking pattern"], "sample_ids": ["sU53zg9Jp7s", "sCeWURVHfOM"], "start_seconds": ["380", 
"30"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "ticking, pattern, clock"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "- a close-up view of the clock's inner workings"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "ticking of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zkKdxzNC97Y", "zl9Dqx-j7q4"], "start_seconds": ["27", "6"], "properties": ["loud, bang, noise", "engine, laugh, loud"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a door is opened and closed", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["electronic beeps occur in a short series", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y682ml90jGw", "xKB8O8LTs6s"], "start_seconds": ["11", "70"], "properties": ["beeps, series, electronic", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a beeping sound is being made ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "wind blowing followed by a zoom"], "sample_ids": ["tw76HGONaKg", "vr8ZXjEBhMQ"], "start_seconds": ["570", "150"], "properties": ["A, game, keyboard", "wind, blow, zoom"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina 
of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["two frogs croak at each other", "several insects fly while two men talk"], "sample_ids": ["zg0X6BnhOLQ", "s-T9OVOiMLo"], "start_seconds": ["410", "330"], "properties": ["two frogs, croak, at each other", "several, fly, men"], "captions_pred_video": ["footage of lightning in the sky at night", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a frog is croaking", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a duck quacks continuously"], "sample_ids": ["se87d6yxEOA", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "quacks, continuously, duck"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a diesel truck 
engine runs while wind blows", "roadway noise occurs and a truck accelerates"], "sample_ids": ["xyL9F5VrjkE", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["engine, run, wind", "noise, truck, accelerate"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a fire truck entering a garage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a car is driving on the road "], "question": "which truck is moving", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yPUYU6t3rwo", "tiDFTC-5vU"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "male, duck, laugh"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", null], "captions_pred_audio": ["insects buzz and a man speaks", "a man is speaking and ducks are quacking"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["a child speaks in closed space", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yW6FWLSLkx4", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["child, space, speak", "People, motor, brakes"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a closed space", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "a toilet flushes and water drains"], "sample_ids": ["uYT5gxnyMWM", "sfAvvZwdLCY"], "start_seconds": ["50", "20"], "properties": ["person, spray, yell", "water drains, flushes, water"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of the 
toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a man speaks as a motor runs in the background"], "sample_ids": ["zofjfKhqLk8", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "background, motor, run"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man speaks while a motorcycle revs and accelerates "], "question": "which motor is running in the background", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "some men converse over an engine running"], "sample_ids": ["vs65y4qmyBE", "sCiy7QS1U"], "start_seconds": ["340", "300"], "properties": ["engine, run, man", "men, converse, engine"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man speaking to an engine?", "label": 0}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "small dogs yip and bark sharply"], "sample_ids": ["tDVADusiIoc", "v-wcQf4BDY0"], "start_seconds": ["60", "120"], "properties": ["wind, radio, waves", "bark, yip, sharply"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", 
"television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["shmR4OZtzqA", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "a woman, a television program, a bird"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man speaks while a motor runs", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["some people speak", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vbZ-0lGPneg", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "airplane, boy, fly"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "some light rustling followed by a loud burp and a girl speaking"], "sample_ids": ["xZepNM9qcRA", 
"vdoxuJn9lTc"], "start_seconds": ["30", "40"], "properties": ["background, motor, run", "burp, loud, girl"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a child speaks followed by a burp"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a horn blasts as warning bells ring", "a woman speaks happily and an animal chirps"], "sample_ids": ["zgUgkpk78xU", "uWAAAL4CIoc"], "start_seconds": ["70", "0"], "properties": ["horn, bells, ring", "a woman, chirps, animal"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person speaks briefly", "wind blows as people chatter quietly"], "sample_ids": ["zOZleIRqZm4", "xBxDz0CFVn0"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "wind, chatter, people"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xKB8O8LTs6s", "tdWhHV3X25Q"], "start_seconds": ["70", "60"], "properties": 
["music, radio, gunshots", "applause, audience, yells"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["yLy-WycbVVE", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["background, people, talk", "harsh, wind, blows"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "pigeons vocalize and birds chirp"], "sample_ids": ["siJFXfGWgDk", "uiS58TNyUiw"], "start_seconds": ["50", "430"], "properties": ["man, woman, vehicle", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["water flows and trickles", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tB7hWb9gTuQ", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["water, flow, trickle", "applause, audience, yells"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "a man is talking to another man on a 
stage in front of a microphone"], "captions_pred_audio": ["water is splashing and gurgling", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["white noise and birds chirping", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wRBHTgrbiwg", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["noise, white, chirping", "clickety-clack, train, whistle"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a train blows its whistle and blows its horn "], "question": "which noise is continuous", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a car accelerates and wind blows"], "sample_ids": ["sZPuqDgX2V0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["engine, accelerate, intercom", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which car accelerates", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "wind blows as people chatter quietly"], "sample_ids": ["tK4VlLsNxak", "xBxDz0CFVn0"], "start_seconds": ["120", "30"], "properties": ["a, dial, telephone", "wind, chatter, people"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a man speaks and a rooster crows while men talk in the 
background"], "sample_ids": ["u21-Z5gJCB8", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "rooster, crow, background, men"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a person is whistling", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sIXTftIuUgw", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["person, whistling, person", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sQGXqGcwOTc", "ukg5L09Wpvo"], "start_seconds": ["3", "150"], "properties": ["audio, kid, giggles", "clickety-clack, train, whistle"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xO-Q2BlIIPU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["two men, exclamation, speak", "stream, water, flow"], "captions_pred_video": ["a clock with a 
green glowing display showing the time 09 07 2016 12 31 2016", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a stream of water flowing as people talk and wind blows?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "someone whistles a tune"], "sample_ids": ["s3cTDAj31g", "sIXTftIuUgw"], "start_seconds": ["80", "90"], "properties": ["man, talk, woman", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "some tunes played by whistling"], "sample_ids": ["w5W5Kqtc8E", "u6BnG6YZqJ4"], "start_seconds": ["100", "0"], "properties": ["water, splashes, motorboat", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wP8ZKrlx3oA", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["heavy, rain, fall", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a train blows its whistle and blows its horn "], "question": "which entity is continuous", "label": 1}, {"captions": ["a young woman speaks and 
laughs and an animal snorts", "someone snores nearby"], "sample_ids": ["uEU-Hg5MTN8", "spJCm8tD9Zo"], "start_seconds": ["27", "90"], "properties": ["a woman, laughs, animal", "someone snores, nearby, someone"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["w8uLijTqtlU", "wqZ135Ssz0"], "start_seconds": ["70", "60"], "properties": ["wind, microphone, noise", "two men, woman, birds"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a man speaks as a car is passing by"], "sample_ids": ["x5cuQjOdM3E", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["cat, talk, meow", "a, car, pass"], "captions_pred_video": ["a black background with an airplane flying in the sky", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a male 
speaks and another male speaks", "an airplane engine spools and people speak"], "sample_ids": ["viuTg1M-dqg", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["two males, speaking, male", "airplane, engine, spool"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a motorcycle engine is revving while people are speaking", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y8dSeubCNI", "uEU-Hg5MTN8"], "start_seconds": ["4", "27"], "properties": ["engine revving, people speaking, motorcycle", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["goats bleat and people speak", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["z5iUE5h0EPs", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["goats bleat, people speak, language", "a woman, something, fried"], "captions_pred_video": ["of the goat in the barn", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a goat bleats and a man speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a toilet flushes and water drains", "water flows as men speak and yell"], "sample_ids": ["sfAvvZwdLCY", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["water drains, flushes, water", "water, flow, men"], "captions_pred_video": ["footage of the toilet in 
the bathroom", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water", "label": 1}, {"captions": ["paper is crumpling consistently", "plastic is tapped on while someone speaks"], "sample_ids": ["v5cSxLaHADY", "wvKpEYswXO0"], "start_seconds": ["0", "150"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "plastic, tap, speak"], "captions_pred_video": ["footage of the person holding a pair of scissors", "of the person preparing food in the kitchen"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is being tapped on", "label": 0}, {"captions": ["an electric engine works nearby followed by a child talking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["xSKJGCItUWE", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["engine, work, child", "animal, grunts, snorts"], "captions_pred_video": ["footage of the helicopter flying in the room", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["ylpYOorfH4o", "uZesmtKZGSw"], "start_seconds": ["410", "250"], "properties": ["engine, running, wind", "men, talk, cars"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the 
fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a telephone rings followed by a woman talking"], "sample_ids": ["ul60S8TXDA8", "tGcFnX0GHI"], "start_seconds": ["60", "0"], "properties": ["sound, distance, bell", "ring, talk, woman"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", null], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["people speak and tapping occurs", "white noise and snoring with some rustling in the background"], "sample_ids": ["tFCUUGdREgA", "xzKKf9bKNUo"], "start_seconds": ["70", "10"], "properties": ["people, tap, speak", "background, noise, snoring"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "shows a woman 
laying on a bed with her eyes closed and her mouth open"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a person snoring loudly"], "question": "which entity is more quiet", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "a clock ticktocks"], "sample_ids": ["ujMt0-D-x2k", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["snoring, rhythmical, nearby", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of the dog playing with a toy on the floor", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person is snoring loudly", "a clock is ticking loudly"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["vddP56-ogds", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["water, splash, person, laugh", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a train horn sounds as it passes by", "a pigeon cooing as an insect buzzes by briefly"], "sample_ids": ["ukg5L09Wpvo", "yZrFNS7GFBQ"], "start_seconds": ["150", "30"], "properties": ["sound, train, horn", "pigeon, buzzes, insect"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "of the bird in the cage"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "an owl hoots in the background "], "question": "which entity is a bird?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a infant makes noise and is excited"], "sample_ids": ["vXlk0lIQBFo", "wIJK3-5y0kA"], "start_seconds": ["470", "30"], "properties": ["wind, speak, vocalize", 
"noise, excited, infant"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a man speaks as horns blow", "people cheer as a vehicle engine revs"], "sample_ids": ["tHyNqRyK34A", "xjhAnI2q6hM"], "start_seconds": ["24", "6"], "properties": ["a, man, speaks", "engine revs, vehicle, people"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["ukg5L09Wpvo", "yks4cLgIDMc"], "start_seconds": ["150", "170"], "properties": ["a train, a horn, a bell", "background, speaking, child"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["a door opens and birds chirp", "a man speaks as a motor runs in the background"], "sample_ids": ["yeFvk9x0wWI", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "background, motor, run"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds chirp in the background as a car 
drives by ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["uZesmtKZGSw", "uYT5gxnyMWM"], "start_seconds": ["250", "50"], "properties": ["men, talk, cars", "a, scream, girl"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has more people", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a woman speaks as she rubs two objects together"], "sample_ids": ["yYEVLuqEytU", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["grunt, slurp, background", "two objects, woman, speak"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "people applaud and hoot and chat quietly"], "sample_ids": ["zVacuqSb4LI", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["blares, fades, train", "people, applaud, hoot"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", null], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is 
depressed", "a telephone rings followed by a woman talking"], "sample_ids": ["rwtmaKiCcQU", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["nozzle, depressed, spray can", "ring, talk, woman"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", null], "captions_pred_audio": ["spraying and people speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "people cheer as a vehicle engine revs"], "sample_ids": ["wRBHTgrbiwg", "xjhAnI2q6hM"], "start_seconds": ["50", "6"], "properties": ["bird, owl, speak", "engine revs, vehicle, people"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tDlysoZiA1I", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["animal, grunts, chirps", "engine, laugh, loud"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a small engine spits as it runs", "an infant crying as a woman laughs"], "sample_ids": ["sZvwOuuPGP0", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["spits, engine, runs", "a, laugh, infant"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a 
medium engine is running ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["paper folding and crinkling", "a man speaks followed by another man speaking outside"], "sample_ids": ["zPpG3RD8lSs", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "two men, speak, follow"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wyllXV6PjKo", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["a kid, talk, cry", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", 
"label": 1}, {"captions": ["someone is snoring while sleeping", "wind blowing followed by a zoom"], "sample_ids": ["ujMt0-D-x2k", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["snore, sleep, someone", "wind, blow, zoom"], "captions_pred_video": ["of the dog playing with a toy on the floor", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a person is snoring loudly", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a person", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "water flows as men speak and yell"], "sample_ids": ["vddP56-ogds", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["water, splash, person, laugh", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows water flowing?", "label": 1}, {"captions": ["water flows followed by women screaming", "a infant makes noise and is excited"], "sample_ids": ["w5W5Kqtc8E", "wIJK3-5y0kA"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a frog croaks as other frogs croak in the background"], "sample_ids": ["zuua6-5goWw", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["birds, chirp, quiet, man, speaks", "background, frog, croak"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 
10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a frog is croaking"], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "a man speaks as a car is passing by"], "sample_ids": ["vs65y4qmyBE", "sK4u5T8hW78"], "start_seconds": ["340", "30"], "properties": ["wind, blows, strongly", "a, car, pass"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to a car?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "roadway noise occurs and a truck accelerates"], "sample_ids": ["xjhAnI2q6hM", "tgbONvsP47Y"], "start_seconds": ["6", "0"], "properties": ["engine revs, vehicle, people", "noise, truck, accelerate"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of a 
fire truck entering a garage"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a car is driving on the road "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a girl speaks followed by a scream and more girls talking", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["uYT5gxnyMWM", "rwtmaKiCcQU"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "nozzle, depressed, spray can"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "spraying and people speaking"], "question": "which entity is about a spray can?", "label": 1}, {"captions": ["a drill runs and two people laugh", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tEE3MpBt1sg", "tiDFTC-5vU"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "male, duck, laugh"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking and ducks are quacking"], "question": "which entity has more people laughing", "label": 1}, {"captions": ["a motorcycle engine is idling", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vZAqdHZ81yA", "tDVADusiIoc"], "start_seconds": ["180", "60"], "properties": ["engine, motorcycle, idling", "water, radio, man"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a bus engine idles 
while a woman speaks making an announcement"], "sample_ids": ["vh30P49Po6s", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["loud, continuous, quacks", "engine, idle, woman"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a subway train is moving "], "question": "which entity is quieter", "label": 1}, {"captions": ["a weapon fires multiple times", "a vehicle engine revs and tires squeal"], "sample_ids": ["sMC07Ucy7kg", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["weapon, fire, multiple", "engine revs, tires squeal, vehicle"], "captions_pred_video": ["footage is from a car's point of view", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xKB8O8LTs6s", "yDoT73BWsdA"], "start_seconds": ["70", "10"], "properties": ["music, gunshots, explosion", "engine, revs, vehicle"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a car accelerates and wind blows"], "sample_ids": ["wPz6QRAkEb4", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["chirps, tweets, song", "accelerates, wind, blows"], "captions_pred_video": ["a bird in a cage on top of a pole", null], 
"captions_pred_audio": ["birds are chirping in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak then an engine runs", "a toilet flushes and a female speaks"], "sample_ids": ["uMTTDZ2mb4", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["engine, run, people", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["x9JovgqUcs", "uZesmtKZGSw"], "start_seconds": ["500", "250"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a duck quacks continuously"], "sample_ids": ["xV7Mg1QucSc", "vh30P49Po6s"], "start_seconds": ["14", "30"], "properties": ["alarm, ticktocks, laughs", 
"quacks, continuously, duck"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["goats bleat and people speak", "water pouring and bubbling"], "sample_ids": ["z5iUE5h0EPs", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["goats bleat, people speak, language", "water, bubbles, pouring"], "captions_pred_video": ["of the goat in the barn", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a goat bleats and a man speaks", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "some tunes played by whistling"], "sample_ids": ["vSeGhaZt-aI", "u6BnG6YZqJ4"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, run", "tune, play, whistling"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a person whistling a song"], "question": "which 
entity is a musical performance", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "an infant crying frantically"], "sample_ids": ["xvDdE3zNf8Y", "zwOBqeFTgiU"], "start_seconds": ["120", "30"], "properties": ["A, crumple, paper", "cry, infant, frantically"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman speaks and crumples paper", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["someone snores nearby", "vehicles pass by on a roadway"], "sample_ids": ["spJCm8tD9Zo", "tgbONvsP47Y"], "start_seconds": ["90", "0"], "properties": ["someone snores, nearby, someone", "pass, vehicle, roadway"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person is snoring loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a baby cries and a woman moans", "a child speaks"], "sample_ids": ["smDKStoHBJo", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["a, cry, woman", "a, child, speaks"], "captions_pred_video": ["a man holding a crying baby in his arms", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child?", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zALy31PjDl0", "sLUnaPT5gM8"], "start_seconds": ["21", "0"], "properties": ["a man, a vehicle, a horn", "loud, laughter, intermittent"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "of a baby laying on his stomach in a blue shirt and diaper"], 
"captions_pred_audio": ["a man is speaking and a car horn is honking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a propeller rotates loudly and intensely"], "sample_ids": ["y2ZBGpgbhHM", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["birds, tweet, pant", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["birds chirping and a dog panting", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["wEBlkGWVWwE", "vzxHnu-SFEw"], "start_seconds": ["260", "80"], "properties": ["a, babble, woman", "two objects, woman, speak"], "captions_pred_video": ["shows a person writing on the whiteboard", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman is 
speaking and breathing with mechanisms in the background "], "question": "which woman speaks as she rubs two objects together?", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a child speaks"], "sample_ids": ["y2ZBGpgbhHM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["dog, chirp, breathe", "a, child, speaks"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["water bubbles and gurgles.", "a car accelerates and wind blows"], "sample_ids": ["tB7hWb9gTuQ", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["bubbles, gurgles, water", "accelerates, wind, blows"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", null], "captions_pred_audio": ["water is splashing and gurgling", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "vehicles pass by on a roadway"], "sample_ids": ["w0xsN8X18Y", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["music, surface, rain", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "a car accelerates and wind blows"], "sample_ids": ["x4a9YGIw4ok", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["water, gurgles, stops", "accelerates, wind, blows"], "captions_pred_video": ["footage is blurry and out of focus", null], 
"captions_pred_audio": ["a toilet flushes and water splashes", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp and wind blows", "wind blows as people chatter quietly"], "sample_ids": ["sxIvBMSavMQ", "xBxDz0CFVn0"], "start_seconds": ["210", "30"], "properties": ["birds, chirp, wind", "wind, chatter, people"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["y2bVZ7rz-5M", "ukg5L09Wpvo"], "start_seconds": ["280", "150"], "properties": ["engine, horn, siren", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a train blows its whistle and blows its horn "], "question": "which entity is a train", 
"label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["sOa7g-44Dag", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["background, man, spray", "a, chirps, bird"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking and bees are buzzing"], "question": "which entity has a bird chirp?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vZAw4apG0Es", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["background, clock, ticktocks", "a woman, something, fried"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman is speaking while food is frying in the background"], "question": "which entity has a clock ticktocking in the background?", "label": 0}, {"captions": ["an adult woman and an adult man speak", "a small engine spits as it runs"], "sample_ids": ["zTLVJCo4WEE", "sZvwOuuPGP0"], "start_seconds": ["30", "50"], "properties": ["two people, adult, speak", "spits, engine, runs"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a medium engine is running "], "question": "which entity is not a person", "label": 1}, {"captions": ["an insect buzzes around continuously", "a vehicle engine accelerating then running on idle"], "sample_ids": ["v25l1jef3JY", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["buzzes, 
continuously, insect", "engine, accelerate, idle"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a male speaks over some small clicks", "paper is crumpling consistently"], "sample_ids": ["uXxVebHsGZ8", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["male, clicks, speak", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xKB8O8LTs6s", "tDVADusiIoc"], "start_seconds": ["70", "60"], "properties": ["music, gunfire, explosion", "water, radio, man"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "water flows and trickles"], "sample_ids": ["yZp6xizR0yU", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "water, flow, trickle"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "water is splashing and 
gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a man speaking with light rustling"], "sample_ids": ["wztCSUxOf8", "zOZleIRqZm4"], "start_seconds": ["130", "80"], "properties": ["a crowd, yells, applauds", "light, rustling, man"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking with crickets chirping in the background"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "water flows as men speak and yell"], "sample_ids": ["yRx9txMcBl0", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["accelerates, tires, squeals", "water, flow, men"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a vehicle?", "label": 0}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wSVhSdj0F0", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["horn honks, 
keys jingle, slam", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a duck quacks and a woman speaks"], "question": "which entity is more likely to be heard in a car", "label": 0}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "some men converse over an engine running"], "sample_ids": ["y2bVZ7rz-5M", "sCiy7QS1U"], "start_seconds": ["280", "300"], "properties": ["motor noise, horn, siren", "men, converse, engine"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation between men?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks as a car is passing by"], "sample_ids": ["sfAvvZwdLCY", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "a, car, pass"], "captions_pred_video": ["footage of the toilet in the bathroom", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["ukxt9I7eMMg", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["continuous, woman, speaking", "loud, 
jet engine, roar"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "someone whistles a tune"], "sample_ids": ["zkKdxzNC97Y", "sIXTftIuUgw"], "start_seconds": ["27", "90"], "properties": ["loud, bang, noise", "someone, tune, whistle"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a person whistling a song"], "question": "which entity is softer", "label": 1}, {"captions": ["a dog barks and whimpers", "vehicles pass by on a roadway"], "sample_ids": ["sShpyu2l4YQ", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["barks, whimpers, dog", "pass, vehicle, roadway"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a dog is barking and growling", "a car is driving on the road "], "question": "which entity is more passive", "label": 1}, {"captions": ["someone snores nearby", "some tunes played by whistling"], "sample_ids": ["spJCm8tD9Zo", "u6BnG6YZqJ4"], "start_seconds": ["90", "0"], "properties": ["someone snores, nearby, someone", "tune, play, whistling"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person is snoring loudly", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "paper is crumpling consistently"], "sample_ids": ["vSeGhaZt-aI", "v5cSxLaHADY"], "start_seconds": ["50", "0"], 
"properties": ["water, bubbles, speak", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "an insect buzzes around continuously"], "sample_ids": ["wRV8yMk886E", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["liquid, spray, nozzle", "buzzes, continuously, insect"], "captions_pred_video": ["two cars are parked in a parking lot at night", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["an insect buzzes around continuously", "a person sniffles and then sneezes in the distance"], "sample_ids": ["v25l1jef3JY", "uRlbY6aoBU"], "start_seconds": ["0", "0"], "properties": ["buzzes, continuously, insect", "a, distance, sneeze"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is sneezing "], "question": "which entity is not a person?", "label": 0}, {"captions": ["rain falls on a surface as men speak and music plays", "an adult male speaks and dials a rotary phone"], "sample_ids": ["w0xsN8X18Y", "tK4VlLsNxak"], "start_seconds": ["30", "120"], "properties": ["music, surface, rain", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": [null, "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking and using a sewing machine"], 
"question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["small dogs yip and bark sharply", "vehicles pass by on a roadway"], "sample_ids": ["v-wcQf4BDY0", "tgbONvsP47Y"], "start_seconds": ["120", "0"], "properties": ["bark, yip, sharply", "pass, vehicle, roadway"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a dog barks and growls", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a male speaks and another male speaks", "a car accelerates and wind blows"], "sample_ids": ["viuTg1M-dqg", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["two males, speaking, male", "accelerates, wind, blows"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "wind blows as people chatter quietly"], "sample_ids": ["vimzuGQvdcU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "wind, chatter, people"], "captions_pred_video": ["a group of people are rafting down a river", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "an infant crying frantically"], "sample_ids": ["wz7N8YRy74I", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "cry, infant, frantically"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy 
day", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a toilet flushes and water drains", "someone is typing on a computer keyboard"], "sample_ids": ["sfAvvZwdLCY", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["water drains, flushes, water", "keyboard, type, computer"], "captions_pred_video": ["footage of the toilet in the bathroom", "how to make money on youtube in spanish"], "captions_pred_audio": ["a toilet is flushed", "a person is typing on a keyboard"], "question": "which object is used to type on a computer", "label": 1}, {"captions": ["a man speaks uses a drill", "a duck quacks loudly and continuously"], "sample_ids": ["x5eIC7S0fbg", "vh30P49Po6s"], "start_seconds": ["60", "30"], "properties": ["A man is speaking, uses a drill, and is a tool", "loud, continuous, quacks"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a duck is quacking loudly"], "question": "which entity is a tool", "label": 0}, {"captions": ["a small engine idles continuously", "winds blows roughly as a vehicle races past"], "sample_ids": ["y5WII6cTH7k", "xjvTpk2Zpr8"], "start_seconds": ["40", "70"], "properties": ["engine, idle, continuously", "wind, blows, vehicle"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zuua6-5goWw", 
"yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["birds, chirp, quiet, man, speaks", "a woman, something, fried"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a person screams glaringly", "a goat bleats as a person speaks"], "sample_ids": ["xC8kbrKJmco", "tPJvjq9QePY"], "start_seconds": ["0", "40"], "properties": ["glaringly, screams, person", "bleats, person, speak"], "captions_pred_video": [null, "a dog and a sheep in a barn"], "captions_pred_audio": ["a goat is bleating ", "a baby cries and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "some tunes played by whistling"], "sample_ids": ["u7C-AEBQM", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["ticks, rhythmic, quiet", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a ticktock of a clock", "a person whistling a song"], "question": "which entity is not playing a tune?", "label": 0}, {"captions": ["a door opens and birds chirp", "winds blows roughly as a vehicle races past"], 
"sample_ids": ["yeFvk9x0wWI", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["door, open, birds", "wind, blows, vehicle"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a man is filing a hard object", "a stream of water runs briefly"], "sample_ids": ["vveS8HT7Uog", "x-PeY8Yb8M4"], "start_seconds": ["100", "300"], "properties": ["a man, hard, object", "stream, water, run"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a car is driving on a wet road "], "question": "which entity is not a stream of water?", "label": 0}, {"captions": ["a fly buzzes around loudly as birds chirp", "a kid speaks followed by music playing"], "sample_ids": ["uJV8NDaHqqk", "tQWGZLItBXk"], "start_seconds": ["100", "170"], "properties": ["loud, fly, chirp", "music, kid, speak"], "captions_pred_video": ["a bee hive in a wooden box", "worms revolution screenshots"], "captions_pred_audio": ["a swarm of bees buzzing around", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity is quieter", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "small dogs yip and bark sharply"], "sample_ids": ["u7C-AEBQM", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["ticks, rhythmic, quiet", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a ticktock of a clock", "a dog barks and growls"], "question": 
"which entity is louder", "label": 1}, {"captions": ["long loud burping by a man", "a clock ticktocks"], "sample_ids": ["xmiUIOhtZyQ", "v-g-j2uTByM"], "start_seconds": ["60", "30"], "properties": ["loud, burp, man", "ticktocks, clock, ticktocks"], "captions_pred_video": ["homer simpson drinking a beer", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person burps and music plays in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a duck quacks continuously"], "sample_ids": ["xyL9F5VrjkE", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["wind, motor, distance", "quacks, continuously, duck"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "small dogs yip and bark sharply"], "sample_ids": ["vddP56-ogds", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["water, flow, laugh", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "some men converse over an engine running"], "sample_ids": ["vimzuGQvdcU", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["a, man, yells", "men, converse, engine"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man 
speaks while a motorcycle revs and accelerates "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zFjIWfSD-4", "su6FAOcOA8c"], "start_seconds": ["410", "4"], "properties": ["People, motor, brakes", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wsHBIgzs9Fs", "tdWhHV3X25Q"], "start_seconds": ["50", "60"], "properties": ["horn, continuous, buzzing", "applause, audience, yells"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking and a crowd is clapping"], "question": "which entity is a response to a performance", "label": 1}, {"captions": ["a weapon fires multiple times", "vehicles pass by on a roadway"], "sample_ids": ["sMC07Ucy7kg", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["weapon, fire, multiple", "pass, vehicle, roadway"], "captions_pred_video": ["footage is from a car's point of view", "footage of a fire truck entering a garage"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a car is driving on the road "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a child speaks in closed space"], "sample_ids": 
["ylpYOorfH4o", "yW6FWLSLkx4"], "start_seconds": ["410", "40"], "properties": ["engine, run, loud", "child, space, speak"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tOj4tdLRaA", "tdWhHV3X25Q"], "start_seconds": ["70", "60"], "properties": ["woman, laugh, baby", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sAam2NqGhLY", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "engine, accelerate, idle"], "captions_pred_video": ["of a little girl sleeping on a couch", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a person is 
snoring", "an engine is idling"], "question": "which entity is not a person", "label": 1}, {"captions": ["a man speaks as crickets sing", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["ryFDPxgDOGc", "vbZ-0lGPneg"], "start_seconds": ["570", "30"], "properties": ["a, crickets, sing", "a woman, a television program, a bird"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a dog is whimpering"], "question": "which entity has a man speaking as crickets sing?", "label": 0}, {"captions": ["a muffled toilet flushes and the water drains", "race cars go around a track as a man commentates"], "sample_ids": ["sfAvvZwdLCY", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["flushes, drains, water", "car, track, man"], "captions_pred_video": ["footage of the toilet in the bathroom", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "an animal growls followed by birds chirping"], "sample_ids": ["tOj4tdLRaA", "y2ZBGpgbhHM"], "start_seconds": ["70", "30"], "properties": ["woman, laugh, baby", "animal, 
growl, bird"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "birds chirping and a dog panting"], "question": "which entity is more likely to be a solitary event", "label": 1}, {"captions": ["a woman talking as an infant is crying", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tMbMDvT50j8", "zFjIWfSD-4"], "start_seconds": ["12", "410"], "properties": ["a, talk, infant", "People, motor, brakes"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a woman talking to an infant?", "label": 0}, {"captions": ["an insect buzzes around continuously", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["v25l1jef3JY", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["buzzes, continuously, insect", "People, motor, brakes"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is not a person?", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uPDn2BFTHk", "uYT5gxnyMWM"], "start_seconds": ["140", "50"], "properties": ["lady, laugh, baby", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "pigeons vocalize and birds chirp"], 
"sample_ids": ["xzKKf9bKNUo", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["background, noise, snoring", "vocalize, bird, chirp"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "of the pigeon in the cage"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["ticking continues without interruption", "people speak as gunfire rings out"], "sample_ids": ["v-g-j2uTByM", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["ticking, continuous, clock", "gunfire, ring, speak"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be interrupted", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a car speeding up in the distance"], "sample_ids": ["sEprKHm8Sj8", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["car, tires, slows", "distance, car, speed"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which car is speeding up in the distance", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "water flows as men speak and yell"], "sample_ids": ["zkKdxzNC97Y", "vJ7JPEFhyLA"], "start_seconds": ["27", "16"], "properties": ["loud, bang, noise", "water, flow, men"], "captions_pred_video": 
["footage of the door opening and closing in slow motion", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and yelling?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "water flows and trickles"], "sample_ids": ["vh30P49Po6s", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "water, flow, trickle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a duck is quacking loudly", "water is splashing and gurgling"], "question": "which entity is quieter", "label": 1}, {"captions": ["a dog barks and whimpers", "a woman speaks happily and an animal chirps"], "sample_ids": ["sShpyu2l4YQ", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["barks, whimpers, dog", "a woman, chirps, animal"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a dog is barking "], "question": "which entity is a bird?", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["uiS58TNyUiw", "wqZ135Ssz0"], "start_seconds": ["430", "60"], "properties": ["vocalize, bird, chirp", "two men, woman, birds"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a video of birds?", "label": 0}, {"captions": ["someone is snoring while sleeping", "pigeons vocalize and birds chirp"], "sample_ids": ["ujMt0-D-x2k", "uiS58TNyUiw"], 
"start_seconds": ["0", "430"], "properties": ["snore, sleep, someone", "vocalize, bird, chirp"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of the pigeon in the cage"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a horn honks and then loudly blares", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wnpJndXuxLc", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["horn, honk, loud", "three men, wind, flow"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["water splashes and a door squeaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sdXV-ylviw", "zFjIWfSD-4"], "start_seconds": ["190", "410"], "properties": ["sound, splash, door", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and taps with background noise ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a door that squeaks?", "label": 0}, {"captions": ["a telephone rings followed by a woman talking", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tGcFnX0GHI", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["ring, talk, woman", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people talking", "label": 1}, {"captions": 
["bees buzz as wind blows", "a man speaks as a car is passing by"], "sample_ids": ["tMJne1a4AFI", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["bees, buzz, wind", "a, car, pass"], "captions_pred_video": ["a swarm of bees on the ground", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "water flows as men speak and yell"], "sample_ids": ["vms5XGTDVQc", "vJ7JPEFhyLA"], "start_seconds": ["220", "16"], "properties": ["paper, crumpled, crinkled", "water, flow, men"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a moving object", "label": 1}, {"captions": ["dogs barking and whimpering", "a man speaks as a car is passing by"], "sample_ids": ["tIY7qOV3rEM", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "a, car, pass"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more passive", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sOa7g-44Dag", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["audio, scratching, man", "male, duck, laugh"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["zuua6-5goWw", "x6ijhqRY38s"], "start_seconds": ["30", "250"], "properties": ["birds, chirp, quiet, man, speaks", "something metal, glass, hit"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking and 
dishes are clanging "], "question": "which entity is about a man talking?", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a woman speaks in a fast tone with a male"], "sample_ids": ["vKrYfzleLB8", "sTpirNYo8vQ"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "a, tone, fast"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man is speaking while a car is revving and accelerating "], "question": "which entity has a man yell?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "a stream of water runs briefly"], "sample_ids": ["zY3icUyMdh8", "x-PeY8Yb8M4"], "start_seconds": ["20", "300"], "properties": ["dog, bark, engine", "stream, water, run"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a female speaks softly as paper crinkles"], "sample_ids": ["xjhAnI2q6hM", "xvDdE3zNf8Y"], "start_seconds": ["6", "120"], "properties": ["engine revs, vehicle, people", "a, female, speaks"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman speaks and crumples paper"], "question": "which entity is a person", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "birds chirp and objects are moved around"], "sample_ids": ["t8CV69hcvF0", "yPUYU6t3rwo"], "start_seconds": ["210", 
"370"], "properties": ["person, sneeze, follow", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman sneezes and speaks", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["an engine runs loudly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vqZuVbG6-HI", "yajyRTUQk3U"], "start_seconds": ["130", "400"], "properties": ["loud, engine, run", "a woman, something, fried"], "captions_pred_video": ["footage is blurry because it's raining outside", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "pigeons vocalize and birds chirp"], "sample_ids": ["s59PfAghdkM", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["bird, chirp, background, horse, neigh", "vocalize, bird, chirp"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "of the pigeon in the cage"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yFB25fqfU8I", "yajyRTUQk3U"], "start_seconds": ["300", "400"], "properties": ["wave, crash, shoreline", "a woman, something, fried"], "captions_pred_video": ["footage of a person surfing in the ocean", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking 
and laughing while water is splashing and gurgling", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "several insects fly while two men talk"], "sample_ids": ["vzxHnu-SFEw", "s-T9OVOiMLo"], "start_seconds": ["80", "330"], "properties": ["two objects, woman, speak", "several, fly, men"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a woman speaking as she rubs two objects together?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "birds coo incessantly"], "sample_ids": ["zY3icUyMdh8", "yZrFNS7GFBQ"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "coo, bird, incessant"], "captions_pred_video": ["footage of a bus driving through a residential street 
at night", "of the bird in the cage"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "an owl hoots in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a dark barks and whimpers", "water is sprayed across a hard surface"], "sample_ids": ["sYj4hpDUZDQ", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["barks, whimpers, dark", "water, spray, surface"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a dog barks and a cat meows", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["food is frying and sizzles", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zNRChLjqcU", "w5W5Kqtc8E"], "start_seconds": ["220", "100"], "properties": ["food is frying, sizzles, food", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a duck quacks loudly and continuously", "a woman speaks happily and an animal chirps"], "sample_ids": ["vh30P49Po6s", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["loud, continuous, quacks", "a woman, chirps, animal"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a dog is barking "], "question": "which entity is quieter", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yswmmRZFItk", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["background, frog, croak", "a woman, something, fried"], 
"captions_pred_video": ["a close up of a frog in the water", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a frog is croaking", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sU53zg9Jp7s", "uEU-Hg5MTN8"], "start_seconds": ["380", "27"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a woman, laughs, animal"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and laughing?", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["s59PfAghdkM", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["bird, chirp, background, horse, neigh", "airplane, boy, fly"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "frogs croak and vocalize"], "sample_ids": ["vb1fPSDI4c", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["multiple, people, yell", "croak, vocalize, frog"], "captions_pred_video": [null, "a 
close up of a frog in the water"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vlS6YMeWAPo", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["sheep, baa, birds", "loud, multiple, distance"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a woman speaks happily and an animal chirps"], "sample_ids": ["ugHJF0hfYkg", "uWAAAL4CIoc"], "start_seconds": ["10", "0"], "properties": ["loud, intense, propeller", "a woman, chirps, animal"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a dog is barking "], "question": "which entity is quieter", "label": 1}, {"captions": ["an adult woman and an adult man speak", "a man speaks as a car is passing by"], "sample_ids": ["zTLVJCo4WEE", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["two people, adult, speak", "a, car, pass"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking with background 
noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vfYTJq7nU", "w34HjHr6gAY"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a beep sounds followed by a child speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "wind blowing followed by a zoom"], "sample_ids": ["y2bVZ7rz-5M", "vr8ZXjEBhMQ"], "start_seconds": ["280", "150"], "properties": ["engine, horn, siren", "wind, blow, zoom"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "people applaud and hoot and chat quietly"], "sample_ids": ["uiS58TNyUiw", "wwyfGO2J4"], "start_seconds": ["430", "90"], "properties": ["audio, man, speaking", "people, applaud, hoot"], 
"captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a man speaks then blows a vehicle horn as wind blows"], "sample_ids": ["zofjfKhqLk8", "zALy31PjDl0"], "start_seconds": ["10", "21"], "properties": ["background, metal, clank", "a man, a vehicle, a horn"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a motorcycle is parked on the side of a brick walkway"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a car horn is honking"], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["uYT5gxnyMWM", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["female, spraying, scream", "a, scream, girl"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a baby is crying"], "question": "which entity has a scream", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "some tunes played by whistling"], "sample_ids": ["rwtmaKiCcQU", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["nozzle, depressed, spray can", "tune, play, whistling"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["spraying and people speaking", "a person whistling a song"], "question": "which entity is playing 
tunes", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["rwtmaKiCcQU", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["nozzle, depressed, spray can", "wind, blow, vehicle"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", null], "captions_pred_audio": ["spraying and people speaking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["several insects fly while two men talk", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["s-T9OVOiMLo", "xKB8O8LTs6s"], "start_seconds": ["330", "70"], "properties": ["several, fly, men", "music, gunfire, explosion"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["su6FAOcOA8c", "zj2R0XoFr5k"], "start_seconds": ["4", "50"], "properties": ["engine, idle, woman", "airplane, boy, fly"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a machine beeps continuously", "a clock ticktocks"], "sample_ids": ["y682ml90jGw", "v-g-j2uTByM"], 
"start_seconds": ["11", "30"], "properties": ["beeps, machine, continuously", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a beeping sound is being made ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["water splashes as an animal walks through", "a toilet flushes and a female speaks"], "sample_ids": ["w1ir-sZ3Im8", "yaln9y8I7ms"], "start_seconds": ["90", "230"], "properties": ["animal, water, splashes", "female, flushes, toilet"], "captions_pred_video": ["footage of a group of people riding horses through a river", "footage is blurry and out of focus"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a clock ticktocks briefly"], "sample_ids": ["wPz6QRAkEb4", "u7C-AEBQM"], "start_seconds": ["60", "30"], "properties": ["chirps, tweets, song", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": ["a bird in a cage on top of a pole", null], "captions_pred_audio": ["birds are chirping in the background ", "a ticktock of a clock"], "question": "which entity is silent", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vW4x7S1VfQc", "yajyRTUQk3U"], "start_seconds": ["150", "400"], "properties": ["clacking, oil, woman", "a woman, something, fried"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "- a woman cooking in the kitchen"], "captions_pred_audio": ["food sizzles in a frying pan", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 0}, {"captions": ["a woman speaks and then a man 
speaks", "someone whistles a tune"], "sample_ids": ["vbpKkWvfOu4", "sIXTftIuUgw"], "start_seconds": ["560", "90"], "properties": ["a, man, speaks", "someone, tune, whistle"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a stream of water runs briefly"], "sample_ids": ["sapQIQUhFc", "x-PeY8Yb8M4"], "start_seconds": ["280", "300"], "properties": ["water, stream, trickles", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a car is driving on a wet road "], "question": "which stream is running", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["siJFXfGWgDk", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["a, bird, vehicle", "airplane, boy, fly"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a bird flying by?", "label": 0}, {"captions": ["a motorcycle engine is idling", "water splashes as an animal walks through"], "sample_ids": ["vZAqdHZ81yA", "w1ir-sZ3Im8"], "start_seconds": ["180", "90"], "properties": ["engine, motorcycle, 
idling", "animal, water, splashes"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["an engine is idling loudly", "water splashes and gurgles as people speak"], "question": "which entity is not a vehicle", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["yRx9txMcBl0", "uWPRNLnpy7Y"], "start_seconds": ["40", "10"], "properties": ["motors, tires, screech", "accelerate, laugh, vehicle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "is taken from a car driving down the street"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["some clanking with distant murmuring", "someone whistles a tune"], "sample_ids": ["uMTTDZ2mb4", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["clanking, murmuring, distant", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["bees buzz as wind blows", "two men and a woman talk while wind blows and birds tweet"], 
"sample_ids": ["tMJne1a4AFI", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["bees, buzz, wind", "two men, woman, birds"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["people clap and speak in the distance", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["wwyfGO2J4", "wDVMhEdTiVw"], "start_seconds": ["90", "30"], "properties": ["clap, distance, speak", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["water splashes as an animal walks through", "a man speaks as water trickles down a stream"], "sample_ids": ["w1ir-sZ3Im8", "sapQIQUhFc"], "start_seconds": ["90", "280"], "properties": ["animal, water, splashes", "water, stream, trickles"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a man is speaking and a stream is flowing in the background "], "question": "which entity is a stream?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "paper is crumpling consistently"], "sample_ids": ["zk-xJGQU8-4", "v5cSxLaHADY"], "start_seconds": ["130", "0"], "properties": ["food, man, woman", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "paper 
is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a woman sneezes then speaks"], "sample_ids": ["zj2R0XoFr5k", "x4dZyf9Gbj0"], "start_seconds": ["50", "130"], "properties": ["airplane, boy, fly", "sneezes, speaks, woman"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman sneezes and speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["xV7Mg1QucSc", "xfaoyyzw2WU"], "start_seconds": ["14", "180"], "properties": ["alarm, ticktocks, laughs", "loud, jet engine, roar"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tDVADusiIoc", "vYkA3cfXp5Q"], "start_seconds": ["60", "30"], "properties": ["wind, radio, waves", "engine, accelerate, idle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "vehicles pass by on a roadway"], "sample_ids": 
["zgUgkpk78xU", "tgbONvsP47Y"], "start_seconds": ["70", "0"], "properties": ["clinking, humming, horn", "pass, vehicle, roadway"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "small dogs yip and bark sharply"], "sample_ids": ["sfAvvZwdLCY", "v-wcQf4BDY0"], "start_seconds": ["20", "120"], "properties": ["flushes, drains, water", "bark, yip, sharply"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a toilet is flushed", "a dog barks and growls"], "question": "which entity is louder", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a person speaks briefly"], "sample_ids": ["sWZzXuWYY", "zOZleIRqZm4"], "start_seconds": ["420", "80"], "properties": ["male, speech, banging", "person, talk, brief"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking with crickets chirping in the background"], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tw76HGONaKg", "zl9Dqx-j7q4"], "start_seconds": ["570", "6"], "properties": ["A, game, keyboard", "engine, laugh, loud"], "captions_pred_video": ["game you are 
currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a woman speaks with water running"], "sample_ids": ["vb1fPSDI4c", "wTideSjRFS0"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "water, running, woman"], "captions_pred_video": [null, "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a woman is speaking while water is running in the background"], "question": "which entity has more people speaking", "label": 0}, {"captions": ["multiple birds chirp and an animal grunts", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tDlysoZiA1I", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["animal, grunt, multiple", "airplane, boy, fly"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a small airplane flying in the sky stock videos and 
royalty-free footage"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "an infant crying as a woman laughs"], "sample_ids": ["xSKJGCItUWE", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["engine, run, boy", "a, laugh, infant"], "captions_pred_video": ["footage of the helicopter flying in the room", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["vddP56-ogds", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["liquid, laughs, man", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["goats bleat and metal clings", "three men talk while wind blows and some liquid flows"], "sample_ids": ["tH17JPjDPnc", "vJ7JPEFhyLA"], "start_seconds": ["260", "16"], "properties": ["bleat, metal, clings", "three men, wind, flow"], "captions_pred_video": ["feed of the goats eating hay in the barn", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a liquid flowing?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "leaves rustling 
followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["wSVhSdj0F0", "ziUT9IFTkjg"], "start_seconds": ["10", "10"], "properties": ["horn honks, keys jingle, slam", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "birds are chirping and a chime is ringing "], "question": "which entity is more natural", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a man is filing a hard object"], "sample_ids": ["tMbMDvT50j8", "vveS8HT7Uog"], "start_seconds": ["12", "100"], "properties": ["a, cry, woman", "a man, hard, object"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage is of a workbench with various tools on it including a hammer and a screwdriver"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is filing and speaking with background noise and breathing "], "question": "which object is harder to file", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "vehicle engines race around a track as a man commentates"], "sample_ids": ["vzxHnu-SFEw", "sZPuqDgX2V0"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "commentator, race, track"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and a helicopter is flying overhead "], "question": "which is a video", "label": 1}, {"captions": ["some clanking with distant murmuring", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uMTTDZ2mb4", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["clanking, murmuring, distant", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["small dogs yip and bark sharply", "water is sprayed across a hard surface"], "sample_ids": ["v-wcQf4BDY0", "sQwlkXjQabo"], "start_seconds": ["120", "10"], "properties": ["bark, yip, sharply", "water, spray, surface"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a dog barks and growls", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a woman speaks and other women and a man talk with her"], "sample_ids": ["vveS8HT7Uog", "vbpKkWvfOu4"], "start_seconds": ["100", "560"], "properties": ["a man, objects, speak", "a, woman, man"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vcmWSmvti8", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["music, man, fire", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking with wind noise in the background "], "question": "which entity has a man speaking as music plays before artillery is fired?", "label": 0}, {"captions": ["loud clanking and banging with brief male speech", "a duck quacks continuously"], "sample_ids": ["sWZzXuWYY", "vh30P49Po6s"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a consistent ticking pattern", "an airplane engine spools and people speak"], "sample_ids": ["sCeWURVHfOM", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["ticking, pattern, clock", "airplane, engine, spool"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["ticking of a clock", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["an aircraft 
engine runs", "a horse runs while two women talk"], "sample_ids": ["yLCORCnd35Q", "sdvI1mHAsc"], "start_seconds": ["0", "20"], "properties": ["engine, aircraft, runs", "two women, horse, run"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", null], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "horses clip-clop and a woman speaks"], "question": "which entity is moving", "label": 1}, {"captions": ["a motorcycle engine is idling", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["vZAqdHZ81yA", "yks4cLgIDMc"], "start_seconds": ["180", "170"], "properties": ["engine, motorcycle, idling", "background, speaking, child"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "paper is crumpling consistently"], "sample_ids": ["zkKdxzNC97Y", "v5cSxLaHADY"], "start_seconds": ["27", "0"], "properties": ["hard, surface, door", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a door is opened and closed", "paper is crumpled and crinkled"], "question": "which object is crumpling", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "small dogs growl, bark and yip."], "sample_ids": ["zdYdyF9-m8U", "sShpyu2l4YQ"], "start_seconds": ["7", "0"], "properties": ["wind, crash, shoreline", "growl, bark, yip"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "the puppies are playing with a toy"], "captions_pred_audio": 
["waves crash and wind blows ", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sEprKHm8Sj8", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["noise, loud, buzzing", "stream, water, flow"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage is blurry and out of focus"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is a source of noise", "label": 0}, {"captions": ["a man speaking with light rustling", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zOZleIRqZm4", "y8WEcpOlT3I"], "start_seconds": ["80", "40"], "properties": ["light, rustling, man", "harsh, wind, blows"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 0}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vfYTJq7nU", "tDVADusiIoc"], "start_seconds": ["130", "60"], "properties": ["rustling, ducks, quack", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking while the wind is blowing 
and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a child speaks in closed space"], "sample_ids": ["vveS8HT7Uog", "yW6FWLSLkx4"], "start_seconds": ["100", "40"], "properties": ["a man, objects, speak", "child, space, speak"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an audience gives applause", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["x6iCUDmRpKQ", "wDVMhEdTiVw"], "start_seconds": ["38", "30"], "properties": ["applause, audience, give", "gun, shoot, water"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a group of people are clapping and cheering", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["zkKdxzNC97Y", "ziUT9IFTkjg"], "start_seconds": ["27", "10"], "properties": ["loud, bang, noise", "background, birds, rustling"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a clock ticktocks"], "sample_ids": 
["ugHJF0hfYkg", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a helicopter is flying overhead ", "a clock is ticking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a stream of water flows as people talk and wind blows"], "sample_ids": ["spYNpeN7rPY", "xBxDz0CFVn0"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "stream, water, flow"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and a baby laughs", 
"water splashes as an animal walks through"], "sample_ids": ["tOj4tdLRaA", "w1ir-sZ3Im8"], "start_seconds": ["70", "90"], "properties": ["woman, laugh, baby", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a baby laughs and a woman speaks", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a cat meows as a young woman speaks"], "sample_ids": ["vSeGhaZt-aI", "x5cuQjOdM3E"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "cat, meows, young woman"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a cat meows and a woman speaks"], "question": "which entity is a cat?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a man speaks as a car is passing by"], "sample_ids": ["ylpYOorfH4o", "sK4u5T8hW78"], "start_seconds": ["410", "30"], "properties": ["engine, running, wind", "a, car, pass"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 
6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking over a running engine and blowing wind?", "label": 0}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "dishes cling together then a man begins to speak"], "sample_ids": ["vfYTJq7nU", "sQGXqGcwOTc"], "start_seconds": ["130", "3"], "properties": ["rustling, ducks, quack", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a duck quacks and a woman speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["food is frying then a woman speaks", "water pouring and bubbling"], "sample_ids": ["ukxt9I7eMMg", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["food, woman, speak", "water, bubbles, pouring"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["w5W5Kqtc8E", "uZesmtKZGSw"], "start_seconds": ["100", "250"], "properties": ["wind, blow, vehicle", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["un9VQlzgZM", "zj2R0XoFr5k"], "start_seconds": ["5", "50"], "properties": ["females, talk, laugh", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman speaks while a helicopter flies 
overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["xM4joTqDVp4", "w34HjHr6gAY"], "start_seconds": ["160", "30"], "properties": ["background, chirp, birds", "beeps, hit, woman"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "an engine runs loudly"], "sample_ids": ["sncRqQ67iJU", "vqZuVbG6-HI"], "start_seconds": ["460", "130"], "properties": ["loud, repeatedly, man", "loud, engine, run"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person is snoring", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["v7jJS8aAyA", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["wind, blows, loudly", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying 
in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["vdoxuJn9lTc", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["person, burp, girl", "background, motor, run"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a child speaks followed by a burp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a church bell rings several times", "people applaud and hoot and chat quietly"], "sample_ids": ["sUVVjE3Ucp8", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["ring, bell, several", "people, applaud, hoot"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", null], "captions_pred_audio": ["a church bell is ringing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a stream of water flows quickly"], "sample_ids": ["tEE3MpBt1sg", "wbHTKEJZyhc"], "start_seconds": ["50", "20"], "properties": ["drill, something, laugh", "stream, water, flow"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video 
footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a waterfall is flowing and people are speaking "], "question": "which entity is moving", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a man speaks as a car is passing by"], "sample_ids": ["vfYTJq7nU", "sK4u5T8hW78"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an 
airplane flies overhead as a woman speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zj2R0XoFr5k", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["airplane, fly, overhead", "a woman, laughs, animal"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman speaking?", "label": 0}, {"captions": ["a muffled toilet flushes and the water drains", "a horse runs while two women talk"], "sample_ids": ["sfAvvZwdLCY", "sdvI1mHAsc"], "start_seconds": ["20", "20"], "properties": ["flushes, drains, water", "two women, horse, run"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "horses clip-clop and a woman speaks"], "question": "which entity is a horse?", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a man speaks over intermittent keyboard taps"], "sample_ids": ["zuua6-5goWw", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["sound, pop, bird", "audio, man, keyboard"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "game you are currently playing on microsoft xbox 360 the 
legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "loud clanking and banging with brief male speech"], "sample_ids": ["tDlysoZiA1I", "sWZzXuWYY"], "start_seconds": ["0", "420"], "properties": ["animal, grunts, chirps", "male, speech, banging"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a sewing machine runs and a man speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "winds blows roughly as a vehicle races past"], "sample_ids": ["wyllXV6PjKo", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["a kid, talk, cry", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman speaks and a baby cries", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a 
vehicle", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a power tool runs and touches a surface"], "sample_ids": ["vbZ-0lGPneg", "zfvPRf3chY"], "start_seconds": ["30", "290"], "properties": ["a woman, a television program, a bird", "power tool, run, touch"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man is speaking while a power tool is being used "], "question": "which entity is touching a surface", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "someone whistles a tune"], "sample_ids": ["xO-Q2BlIIPU", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["two men, exclamation, speak", "someone, tune, whistle"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an adult woman and an adult man speak", "a person sneezes followed by another person speaking"], "sample_ids": ["zTLVJCo4WEE", "t8CV69hcvF0"], "start_seconds": ["30", "210"], "properties": ["two people, adult, speak", "person, sneeze, follow"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman sneezes and speaks"], "question": "which entity shows two people speaking?", "label": 0}, {"captions": ["water splashes and a door squeaks", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["sdXV-ylviw", "wSVhSdj0F0"], "start_seconds": ["190", "10"], "properties": ["sound, splash, door", "horn honks, keys jingle, electronic beep"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["a dog barks and taps with background noise ", "a car horn honks and keys jangle with background noise "], "question": "which entity has a door?", "label": 0}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "vehicles pass by on a roadway"], "sample_ids": ["zALy31PjDl0", "tgbONvsP47Y"], "start_seconds": ["21", "0"], "properties": ["a man, a vehicle, a horn", "pass, vehicle, roadway"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a car is driving on the road "], "question": "which entity is about vehicles", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sapQIQUhFc", "xBxDz0CFVn0"], "start_seconds": ["280", "30"], "properties": ["water, trickles, flow", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity has more water flowing", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wSVhSdj0F0", "y2bVZ7rz-5M"], "start_seconds": ["10", "280"], "properties": ["horn honks, keys jingle, electronic beep", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honks twice and keys jingle", "label": 0}, {"captions": ["sirens ring and approach with humming of distant traffic", "a piece of wood is being placed 
down and sawed"], "sample_ids": ["xERFUeZONz8", "uiItxDsDMFI"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "wood, piece, saw"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["an emergency vehicle siren blares", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["rqu8iB22IY", "vbZ-0lGPneg"], "start_seconds": ["5", "30"], "properties": ["sound, repeats, laugh", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "people speak in the background as a clock ticktocks"], "sample_ids": ["ukxt9I7eMMg", "vZAw4apG0Es"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "background, clock, ticktocks"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a clock is ticking and people are talking"], "question": "which entity is a video of a person cooking?", "label": 0}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "vehicles pass by on a roadway"], "sample_ids": ["yI-KvObbDoY", "tgbONvsP47Y"], "start_seconds": ["260", "0"], "properties": ["sound, smack, wind", "pass, vehicle, roadway"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", 
"footage of a fire truck entering a garage"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a clock ticktocks in wind", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yVumC9TGknc", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, wind", "a woman, a television program, a bird"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a series of beeps and chirps", "a woman is speaking and a dog is whimpering"], "question": "which entity is a clock?", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "race cars go around a track as a man commentates"], "sample_ids": ["x4a9YGIw4ok", "uZesmtKZGSw"], "start_seconds": ["120", "250"], "properties": ["water, gurgles, stops", "car, track, man"], "captions_pred_video": ["footage is blurry and out of focus", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a toilet flushes and water splashes", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a video of a race?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "someone whistles a tune"], "sample_ids": ["xERFUeZONz8", 
"sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["ring, approach, traffic", "someone, tune, whistle"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", null], "captions_pred_audio": ["an emergency vehicle siren blares", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "wind blows as people chatter quietly"], "sample_ids": ["tDlysoZiA1I", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, multiple", "wind, chatter, people"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["water running down a sink while a man is talking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vSeGhaZt-aI", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["water, sink, talk", "three men, wind, flow"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man talking?", "label": 0}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["vlJS7LN2XyM", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["background, clocks, ticking", "background, birds, rustling"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the 
blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "birds are chirping and a chime is ringing "], "question": "which entity has a bird in the background?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a man speaks as a motor runs in the background"], "sample_ids": ["vJrjSeP17yE", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "background, motor, run"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person snoring loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 0}, {"captions": ["birds vocalize and chirp continuously", "a child speaks in closed space"], "sample_ids": ["w1mlz3Pe4fU", "yW6FWLSLkx4"], "start_seconds": ["300", "40"], "properties": ["vocalize, chirp, continuously", "child, space, speak"], "captions_pred_video": ["of a bird in a cage", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds are chirping and singing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xjvTpk2Zpr8", "vbZ-0lGPneg"], "start_seconds": ["70", "30"], "properties": ["engine, run, wind", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a television program?", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person 
snoring loudly nearby", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w2JXXIAdUdg", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["snoring, distance, person", "music, gunfire, explosion"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person snoring and a dog whimpering", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "water splashes as an animal walks through"], "sample_ids": ["s6DESzUTGjY", "w1ir-sZ3Im8"], "start_seconds": ["16", "90"], "properties": ["wind, laugh, woman", "animal, water, splashes"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a helicopter engine 
runs continuously"], "sample_ids": ["su6FAOcOA8c", "ugHJF0hfYkg"], "start_seconds": ["4", "10"], "properties": ["engine, run, woman", "engine, running, continuously"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a helicopter is flying overhead "], "question": "which entity has an engine running continuously", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["wTjoRj1se3U", "tw76HGONaKg"], "start_seconds": ["390", "570"], "properties": ["engine, run, people", "A, game, keyboard"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man speaks and types on a computer keyboard "], "question": "which entity is a video game?", "label": 1}, {"captions": ["a car speeding up in the distance", "a frog croaks as other frogs croak in the background"], "sample_ids": ["u0TrcHhkPQ", "yswmmRZFItk"], 
"start_seconds": ["20", "0"], "properties": ["distance, car, speed", "background, frog, croak"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "wind blowing followed by a zoom"], "sample_ids": ["vbZ-0lGPneg", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["a woman, a television program, a bird", "wind, blow, zoom"], "captions_pred_video": ["of a man holding a baby duck in his hands", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["wztCSUxOf8", "vlS6YMeWAPo"], "start_seconds": ["130", "40"], "properties": ["a crowd, yells, applauds", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "an engine runs loudly"], "sample_ids": ["yZp6xizR0yU", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["animal, bleat, cry", "loud, engine, run"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["two frogs croak at each other", "a frog croaks as other 
frogs croak in the background"], "sample_ids": ["zg0X6BnhOLQ", "yswmmRZFItk"], "start_seconds": ["410", "0"], "properties": ["two frogs, croak, at each other", "background, frog, croak"], "captions_pred_video": ["footage of lightning in the sky at night", "a close up of a frog in the water"], "captions_pred_audio": ["a frog is croaking", "a frog is croaking"], "question": "which frog is croaking", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["x5cuQjOdM3E", "zY3icUyMdh8"], "start_seconds": ["30", "20"], "properties": ["cat, meows, young woman", "dog, bark, engine"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a cat meows and a woman speaks", "a car is driving and dogs are barking and squealing "], "question": "which entity is more calm", "label": 0}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "birds vocalize and chirp continuously"], "sample_ids": ["xKB8O8LTs6s", "w1mlz3Pe4fU"], "start_seconds": ["70", "300"], "properties": ["music, gunfire, explosion", "vocalize, chirp, continuously"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a bird in a cage"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "birds are chirping and singing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a train engine runs and a horn blows", "wind blowing and birds chirping with the distant cooing of a large bird"], "sample_ids": ["zPX9o1uDiI", "wRBHTgrbiwg"], "start_seconds": ["40", "50"], "properties": ["engine, horn, run", "birds, chirp, cooing"], "captions_pred_video": [null, "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "birds are 
chirping and insects are buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["rqfQRErjfk8", "vlS6YMeWAPo"], "start_seconds": ["170", "40"], "properties": ["crowd, cheers, applauds", "sheep, baa, birds"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["siJFXfGWgDk", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["man, woman, vehicle", "water, radio, man"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "water is sprayed across a hard surface"], "sample_ids": ["s7knHCFW82w", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["blow horn, get close, train", "water, spray, surface"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "spraying followed by silence"], "question": "which is a liquid", "label": 1}, {"captions": ["a dark barks and whimpers", "loud intermittent buzzing with intermittent 
laughter"], "sample_ids": ["sYj4hpDUZDQ", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["barks, whimpers, dark", "loud, laughter, intermittent"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a dog barks and a cat meows", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a drill runs and two people laugh"], "sample_ids": ["vBHyYJ8pL0", "tEE3MpBt1sg"], "start_seconds": ["2", "50"], "properties": ["noise, door, opening", "two people, laugh, drill"], "captions_pred_video": [null, "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["an airplane engine runs", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yVPZ2MNWpms", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["engine, airplane, runs", "two men, woman, birds"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "paper is crumpling consistently"], "sample_ids": ["wyllXV6PjKo", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["a baby, a woman, a man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman speaks and a 
baby cries", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["two frogs croak at each other", "someone whistles a tune"], "sample_ids": ["zg0X6BnhOLQ", "sIXTftIuUgw"], "start_seconds": ["410", "90"], "properties": ["two frogs, croak, at each other", "someone, tune, whistle"], "captions_pred_video": ["footage of lightning in the sky at night", null], "captions_pred_audio": ["a frog is croaking", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["a baby cries and a woman speaks", "water flows as men speak and yell"], "sample_ids": ["tMbMDvT50j8", "vJ7JPEFhyLA"], "start_seconds": ["12", "16"], "properties": ["a, cry, woman", "water, flow, men"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a baby crying and a woman speaking?", "label": 0}, {"captions": ["birds chirp and wind blows", "small dogs growl, bark and yip."], "sample_ids": ["sxIvBMSavMQ", "sShpyu2l4YQ"], "start_seconds": ["210", "0"], "properties": ["birds, chirp, wind", "growl, bark, yip"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 
how to extract honey from a beehive beekeeping 101 how to extract honey from a", "the puppies are playing with a toy"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "water is sprayed across a hard surface"], "sample_ids": ["sapQIQUhFc", "sQwlkXjQabo"], "start_seconds": ["280", "10"], "properties": ["water, trickles, flow", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "spraying followed by silence"], "question": "which entity is a spray of water?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "vehicles pass by on a roadway"], "sample_ids": ["ylpYOorfH4o", "tgbONvsP47Y"], "start_seconds": ["410", "0"], "properties": ["engine, running, wind", "pass, vehicle, roadway"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a stream 
of water runs briefly"], "sample_ids": ["wy1eKjR7KC0", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["people, talk, distance", "stream, water, run"], "captions_pred_video": ["two police officers riding motorcycles down the street", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a person snores loudly multiple times at a close distance"], "sample_ids": ["s4Uz1Ffgo04", "sSMl2vc3ek"], "start_seconds": ["100", "20"], "properties": ["roars, background, people speaking", "loud, multiple, distance"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a person snoring loudly"], "question": "which entity is louder", "label": 0}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "people speak as gunfire rings out"], "sample_ids": ["xKB8O8LTs6s", "wqTCwqVRDlk"], "start_seconds": ["70", "80"], "properties": ["music, radio, gunshots", "gunfire, ring, speak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and a gun is fired"], "question": "which entity has more gunshots", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "an infant crying as a woman laughs"], "sample_ids": ["uPDn2BFTHk", "xhmRY9yhC7c"], "start_seconds": ["140", "20"], "properties": ["lady, laugh, baby", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a baby laughs and a woman 
speaks", "a baby cries and a woman speaks"], "question": "which entity is a video of a baby cooing and fidgeting as a lady speaks and laughs?", "label": 0}, {"captions": ["people clap and speak in the distance", "a clock ticktocks"], "sample_ids": ["wwyfGO2J4", "v-g-j2uTByM"], "start_seconds": ["90", "30"], "properties": ["clap, distance, speak", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["continuous snoring", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["sLkeqCDJIyw", "tDlysoZiA1I"], "start_seconds": ["120", "0"], "properties": ["loud, snoring, noise", "animal, grunts, chirps"], "captions_pred_video": [", what is the man doing on the couch? sleeping", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a person is snoring loudly", "birds are chirping and a rooster is crowing "], "question": "which entity is not a noise", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["rwtmaKiCcQU", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["nozzle, depressed, spray can", "a woman, something, fried"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "- a woman cooking in the kitchen"], "captions_pred_audio": ["spraying and people speaking", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "vehicles pass by on a roadway"], "sample_ids": ["uOpoD0gGXcs", "tgbONvsP47Y"], "start_seconds": ["120", "0"], "properties": ["chirps, woman, bird", "pass, vehicle, roadway"], 
"captions_pred_video": ["a herd of cows grazing in the field", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a duck quacks loudly and continuously"], "sample_ids": ["s6DESzUTGjY", "vh30P49Po6s"], "start_seconds": ["16", "30"], "properties": ["wind, laugh, woman", "loud, continuous, quacks"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a machine beeps continuously", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["y682ml90jGw", "wnpJndXuxLc"], "start_seconds": ["11", "50"], "properties": ["beeps, machine, continuously", "blows, vehicle, train"], "captions_pred_video": [null, "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a beeping sound is being made ", "a steam whistle blows and a 
train moves with wind noise in the background "], "question": "which entity is not a machine?", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "some tunes played by whistling"], "sample_ids": ["w5W5Kqtc8E", "u6BnG6YZqJ4"], "start_seconds": ["100", "0"], "properties": ["wind, blow, vehicle", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yajyRTUQk3U", "vJ7JPEFhyLA"], "start_seconds": ["400", "16"], "properties": ["noise, woman, speak", "three men, wind, flow"], "captions_pred_video": ["- a woman cooking in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uWAAAL4CIoc", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["a woman, chirps, animal", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a woman and man are speaking"], "sample_ids": ["xV7Mg1QucSc", "vbpKkWvfOu4"], "start_seconds": ["14", "560"], "properties": ["alarm, ticktocks, laughs", "two 
people, speaking, woman, man"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a woman is speaking and a man is speaking"], "question": "which entity has two people speaking?", "label": 1}, {"captions": ["multiple ducks quack continuously", "wind blows as people chatter quietly"], "sample_ids": ["wfHeoPDLMaM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "wind, chatter, people"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage is blurry and out of focus"], "captions_pred_audio": ["ducks are quacking", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "people applaud and hoot and chat quietly"], "sample_ids": ["sG7TyPnFDR0", "wwyfGO2J4"], 
"start_seconds": ["180", "90"], "properties": ["beeps, machine, smoke alarm", "people, applaud, hoot"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a car speeding up in the distance", "a machine beeps continuously"], "sample_ids": ["u0TrcHhkPQ", "y682ml90jGw"], "start_seconds": ["20", "11"], "properties": ["distance, car, speed", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a beeping sound is being made "], "question": "which entity is not silent", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["yDoT73BWsdA", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["engine, revs, vehicle", "music, gunfire, explosion"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wSVhSdj0F0", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["beep, clang, footsteps", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["goats bleat and metal 
clings", "a girl talking, laughing and sneezing noise"], "sample_ids": ["tH17JPjDPnc", "y4tPJXBKDig"], "start_seconds": ["260", "20"], "properties": ["bleat, metal, clings", "a, noise, talk"], "captions_pred_video": ["feed of the goats eating hay in the barn", "footage of the woman wiping her nose with a tissue"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a woman is speaking and coughing with background noise and breathing "], "question": "which entity is talking", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a toilet flushes and a female speaks"], "sample_ids": ["uEU-Hg5MTN8", "yaln9y8I7ms"], "start_seconds": ["27", "230"], "properties": ["a woman, laughs, animal", "female, flushes, toilet"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "water flows as men speak and yell"], "sample_ids": ["v5P-ThUCINM", "vJ7JPEFhyLA"], "start_seconds": ["400", "16"], "properties": ["background, chirp, bird", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a more natural background", "label": 0}, {"captions": ["a fly buzzes around loudly as birds chirp", "a man speaks followed by another man speaking outside"], "sample_ids": ["uJV8NDaHqqk", "viuTg1M-dqg"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "two men, speak, follow"], "captions_pred_video": ["a bee hive in a wooden box", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a 
swarm of bees buzzing around", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a train horn blows as it passes by"], "sample_ids": ["vf9xf3vMsGM", "zVacuqSb4LI"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "horn, blows, train"], "captions_pred_video": ["of the person washing their hands under the faucet", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a stream of water runs briefly"], "sample_ids": ["zCrAfDfv6-A", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["person, mouse, click", "stream, water, run"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person whistles a song", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": 
["water runs into a sink while men speak", "a car accelerates and wind blows"], "sample_ids": ["vzceMbklWc", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["water, sink, run", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "small dogs yip and bark sharply"], "sample_ids": ["yZmhM1HcsyE", "v-wcQf4BDY0"], "start_seconds": ["4", "120"], "properties": ["engine, roar, water", "bark, yip, sharply"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["electronic beeps occur in a short series", "females talk and laugh over gusting wind"], "sample_ids": ["y682ml90jGw", "un9VQlzgZM"], "start_seconds": ["11", "5"], "properties": ["beeps, series, electronic", "females, talk, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is more natural", "label": 1}, {"captions": ["a man speaks as a machine runs", "small dogs yip and bark sharply"], "sample_ids": ["vD6lYD1l0BY", "v-wcQf4BDY0"], "start_seconds": ["330", "120"], "properties": ["a, machine, run", "bark, yip, sharply"], "captions_pred_video": ["game controller being held in the hands of the person", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a dog barks and growls"], 
"question": "which entity is more active", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a stream of water runs briefly"], "sample_ids": ["tZGN5a7ybxo", "x-PeY8Yb8M4"], "start_seconds": ["60", "300"], "properties": ["ring, train, horn", "stream, water, run"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uEU-Hg5MTN8", "uYT5gxnyMWM"], "start_seconds": ["27", "50"], "properties": ["animal, grunts, snorts", "female, spraying, scream"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a baby is crying"], "question": "which entity has a female speaking?", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uPDn2BFTHk", "zFjIWfSD-4"], "start_seconds": ["140", "410"], "properties": ["lady, laugh, baby", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "paper folding and crinkling"], "sample_ids": ["uOpoD0gGXcs", "zPpG3RD8lSs"], "start_seconds": ["120", "20"], "properties": ["chirps, woman, bird", "paper, fold, crinkle"], "captions_pred_video": ["a herd of cows grazing in the field", 
"how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["birds are chirping and a man is speaking", "the wind blows and a mouse clicks "], "question": "which entity is not a living thing", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["t8CV69hcvF0", "zFjIWfSD-4"], "start_seconds": ["210", "410"], "properties": ["person, sneeze, follow", "People, motor, brakes"], "captions_pred_video": ["of an airplane flying in the dark sky at night", null], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person speaking?", "label": 0}, {"captions": ["water rushes by", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["x-PeY8Yb8M4", "wqZ135Ssz0"], "start_seconds": ["300", "60"], "properties": ["water, rushes, by", "two men, woman, birds"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking and ducks are quacking with wind noise in the 
background "], "question": "which entity is more calm", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["ziUT9IFTkjg", "vVhthZ45k3Y"], "start_seconds": ["10", "30"], "properties": ["background, birds, rustling", "cat, purr, hiss"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking and a cat is meowing"], "question": "which entity is more animal", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a horn rings out as a machine runs by"], "sample_ids": ["yajyRTUQk3U", "slZLHwNbbt4"], "start_seconds": ["400", "300"], "properties": ["a woman, something, fried", "a, horn, run"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine?", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["vf44CgrjT0A", "sjlVMgdGSK0"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "accelerates, vehicle, race car"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a loud burp", "a car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "an airplane engine runs"], "sample_ids": ["wudZTNBtVqc", "yVPZ2MNWpms"], "start_seconds": ["60", "0"], "properties": ["accelerates, engine, wind", 
"engine, airplane, runs"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a car is driving by on the road "], "question": "which entity has a moving engine", "label": 1}, {"captions": ["birds vocalize and a man speaks", "water flows and trickles"], "sample_ids": ["v0wPrLBI3hg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["vocalize, bird, speak", "water, flow, trickle"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine runs and a man speaks", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["yT5WfYMRr-U", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["engine, run, man", "engine, revs, vehicle"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is about a vehicle engine?", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "someone is typing on a computer keyboard"], "sample_ids": ["vlS6YMeWAPo", "v0x1odnXtP0"], "start_seconds": ["40", "210"], "properties": ["sheep, baa, birds", "keyboard, type, computer"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "how to make money on youtube in spanish"], "captions_pred_audio": ["a goat bleats and birds chirp", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, 
{"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a man speaks as a car is passing by"], "sample_ids": ["zofjfKhqLk8", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "a, car, pass"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["s3cTDAj31g", "zO-LSSY92ZM"], "start_seconds": ["80", "30"], "properties": ["man, talk, woman", "liquid, surface, sound"], "captions_pred_video": [null, "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube 
how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["a man is speaking and a baby is crying", "steam is hissing and hissing"], "question": "which entity is silent", "label": 1}, {"captions": ["a door opens and birds chirp", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["yeFvk9x0wWI", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "gun, shoot, water"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause harm", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["wz7N8YRy74I", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["rooster, crow, background, people", "a woman, chirps, animal"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a woman is speaking and a dog is barking "], "question": "which entity has a rooster?", "label": 0}, {"captions": ["a person sniffles and then sneezes in the distance", "a child speaks in closed space"], "sample_ids": ["uRlbY6aoBU", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["a, distance, sneeze", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity 
is in a closed space", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["zkKdxzNC97Y", "vddP56-ogds"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "liquid, laughs, man"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "water is running and gurgling and a man is speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zY3icUyMdh8", "xfaoyyzw2WU"], "start_seconds": ["20", "180"], "properties": ["dog, bark, engine", "loud, jet engine, roar"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a train horn blows as it passes by"], "sample_ids": ["zF8yoL0rkbI", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["engine, run, someone", "horn, blows, train"], "captions_pred_video": ["footage of the traffic on the street at night", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["multiple ducks quack continuously", "a motor noise is accompanied by a door opening and closing"], "sample_ids": ["wfHeoPDLMaM", "vBHyYJ8pL0"], "start_seconds": ["30", "2"], "properties": ["multiple, quack, continuously", "noise, door, opening"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is accompanied by a door opening and closing?", "label": 1}, {"captions": ["people speak as gunfire rings out", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["wqTCwqVRDlk", "tDlysoZiA1I"], "start_seconds": ["80", "0"], "properties": ["gunfire, ring, speak", 
"animal, grunts, chirps"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking and a gun is fired", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a boat travels through the waves as the wind blows loudly and a man speaks over a radio"], "sample_ids": ["vimzuGQvdcU", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["a, man, yells", "wind, radio, waves"], "captions_pred_video": ["a group of people are rafting down a river", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a boat traveling through the waves?", "label": 1}, {"captions": ["footsteps followed by a flushing toilet", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yXrw3GRMZag", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["sound, toilet, flush", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a toilet bowl with water in it", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["rustling followed by a toilet flushing", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a man speaking with light rustling"], "sample_ids": ["sa6TLVbooCc", "zOZleIRqZm4"], "start_seconds": ["240", "80"], "properties": ["people, laugh, child", "light, rustling, man"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a 
woman is speaking and a child is coughing", "a man is speaking with crickets chirping in the background"], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vJrjSeP17yE", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "stream, water, flow"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage is blurry and out of focus"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["water pouring and bubbling", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uyRfq-jKPpo", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["water, bubbles, pouring", "music, gunfire, explosion"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["water is running from a faucet", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a 
woman speaks over sizzling noise", "a woman speaks happily and an animal chirps"], "sample_ids": ["yajyRTUQk3U", "uWAAAL4CIoc"], "start_seconds": ["400", "0"], "properties": ["noise, woman, speak", "a woman, chirps, animal"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "people applaud and hoot and chat quietly"], "sample_ids": ["xzKKf9bKNUo", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["background, noise, snoring", "people, applaud, hoot"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", null], "captions_pred_audio": ["a person snoring loudly", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "vehicles pass by on a roadway"], "sample_ids": ["wRBHTgrbiwg", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["bird, owl, speak", "pass, vehicle, roadway"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a car is driving on the road "], "question": "which entity is more likely to be seen in a city", "label": 1}, {"captions": ["water rushes by", "paper is crumpling consistently"], "sample_ids": ["x-PeY8Yb8M4", "v5cSxLaHADY"], "start_seconds": ["300", "0"], "properties": ["water, rushes, by", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car is driving on a wet road ", 
"paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a frog croaks as other frogs croak in the background"], "sample_ids": ["uOpoD0gGXcs", "yswmmRZFItk"], "start_seconds": ["120", "0"], "properties": ["chirps, woman, bird", "background, frog, croak"], "captions_pred_video": ["a herd of cows grazing in the field", "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a woman speaks and then a man speaks"], "sample_ids": ["sYITalLZjj4", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["water, rushes, background, birds", "a, man, speaks"], "captions_pred_video": ["two ducks are swimming in the water near each other", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["wind blows and birds chirp", "a woman is speaking and a man is speaking"], "question": "which entity is a video of a woman speaking and then a man speaking?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["siJFXfGWgDk", "wnpJndXuxLc"], "start_seconds": ["50", "50"], "properties": ["a, bird, vehicle", "blows, vehicle, train"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a steam whistle blows and a train moves with wind noise in the background "], 
"question": "which entity has a train blowing a horn?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["y8WEcpOlT3I", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["harsh, wind, blows", "sheep, baa, birds"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person is snoring while sleeping", "a man speaks as a motor runs in the background"], "sample_ids": ["vJrjSeP17yE", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "background, motor, run"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person snoring loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 0}, {"captions": ["water splashes and wind noise is made into a microphone", "a child speaks in closed space"], "sample_ids": ["sDSppXIlJrs", "yW6FWLSLkx4"], "start_seconds": ["27", "40"], "properties": ["microphone, water, wind", "child, space, speak"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "someone whistles a tune"], "sample_ids": ["yYEVLuqEytU", 
"sIXTftIuUgw"], "start_seconds": ["40", "90"], "properties": ["animal, pig, background", "someone, tune, whistle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", null], "captions_pred_audio": ["several sheep bleat and a man speaks", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks as a machine runs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vD6lYD1l0BY", "zl9Dqx-j7q4"], "start_seconds": ["330", "6"], "properties": ["a, machine, run", "engine, laugh, loud"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a car speeding up in the distance"], "sample_ids": ["y2ZBGpgbhHM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["dog, chirp, breathe", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "someone is typing on a computer keyboard"], "sample_ids": ["tjmoSi330GM", "v0x1odnXtP0"], "start_seconds": ["23", "210"], "properties": ["speed, water, boat", "keyboard, type, computer"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "how to make money on youtube in spanish"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a person is typing on a keyboard"], "question": "which object is moving", "label": 0}, {"captions": ["a horn honks and then loudly blares", "a telephone rings followed by a woman talking"], "sample_ids": ["wnpJndXuxLc", 
"tGcFnX0GHI"], "start_seconds": ["50", "0"], "properties": ["horn, honk, loud", "ring, talk, woman"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is quieter", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "paper is crumpling consistently"], "sample_ids": ["ugHJF0hfYkg", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["loud, intense, propeller", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a helicopter is flying overhead ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["two frogs croak at each other", "water splashes as an animal walks through"], "sample_ids": ["zg0X6BnhOLQ", "w1ir-sZ3Im8"], "start_seconds": ["410", "90"], "properties": ["two frogs, croak, at each other", "animal, water, splashes"], "captions_pred_video": ["footage of lightning in the sky at night", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a frog is croaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "people cheer as a vehicle engine revs"], "sample_ids": ["vBHyYJ8pL0", "xjhAnI2q6hM"], "start_seconds": ["2", "6"], "properties": ["noise, door, opening", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a truck is revving its engine and a man is speaking "], 
"question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sd7xVssqlw", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["accelerates, tires, squealing", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car accelerates and revs its engine ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "an infant crying as a woman laughs"], "sample_ids": ["yDoT73BWsdA", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["engine, revs, vehicle", "a, laugh, infant"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vf44CgrjT0A", "tDVADusiIoc"], "start_seconds": ["20", "60"], "properties": ["loud, long, person", "water, radio, man"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a loud burp", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a toilet flushes and water drains"], "sample_ids": ["s59PfAghdkM", "sfAvvZwdLCY"], "start_seconds": ["0", "20"], "properties": ["bird, chirp, background, horse, 
neigh", "water drains, flushes, water"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage of the toilet in the bathroom"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a small engine spits as it runs", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["sZvwOuuPGP0", "w6RTHR6AeAg"], "start_seconds": ["50", "40"], "properties": ["spits, engine, runs", "call, owl, screech"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", null], "captions_pred_audio": ["a medium engine is running ", "an owl hoots and mechanisms operate "], "question": "which entity is a bird?", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vSeGhaZt-aI", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "gun, shoot, water"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun?", "label": 1}, {"captions": ["some clanking with distant murmuring", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uMTTDZ2mb4", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["clanking, murmuring, distant", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["vehicles 
pass by on a roadway", "some men converse over an engine running"], "sample_ids": ["tgbONvsP47Y", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["pass, vehicle, roadway", "men, converse, engine"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a dog whimpers as someone inhales/exhales briefly"], "sample_ids": ["sEprKHm8Sj8", "vmrxwuAMb2I"], "start_seconds": ["90", "40"], "properties": ["car, tires, slows", "a dog, inhales, exhales"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "of the dog laying on the bed with his head out of the blanket"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dog barks and growls"], "question": "which entity is a living thing", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["x5cuQjOdM3E", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["cat, meows, young woman", "airplane, boy, fly"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a clock ticktocks continuously", "people cheer as a vehicle engine revs"], "sample_ids": ["vlJS7LN2XyM", "xjhAnI2q6hM"], "start_seconds": 
["30", "6"], "properties": ["ticktocks, clock, ticktocks continuously", "engine revs, vehicle, people"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a ticktock of a clock", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "waves crash against a shoreline and people speak"], "sample_ids": ["vz8868znkVQ", "yFB25fqfU8I"], "start_seconds": ["60", "300"], "properties": ["audio, click, kid speaking", "wave, crash, shoreline"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a video", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a man speaks then blows a vehicle horn as wind blows"], "sample_ids": ["zofjfKhqLk8", "zALy31PjDl0"], "start_seconds": ["10", "21"], "properties": ["noise, stop, motor", "a man, a vehicle, a horn"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a motorcycle is parked on the side of a brick walkway"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a car horn is honking"], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an engine starts and increases in power", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zjTG0gaGCUI", "wqZ135Ssz0"], "start_seconds": ["80", "60"], "properties": ["power, increase, engine", "two men, woman, birds"], "captions_pred_video": [null, 
null], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a woman speaks with water running", "wind blowing followed by a zoom"], "sample_ids": ["wTideSjRFS0", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["water, running, woman", "wind, blow, zoom"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a car speeding up in the distance"], "sample_ids": ["sapQIQUhFc", "u0TrcHhkPQ"], "start_seconds": ["280", "20"], "properties": ["water, stream, trickles", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "paper folding and crinkling"], "sample_ids": ["tw76HGONaKg", "zPpG3RD8lSs"], "start_seconds": ["570", "20"], "properties": ["music, click, man", "paper, fold, crinkle"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the 
legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "the wind blows and a mouse clicks "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp as a train approaches", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xM4joTqDVp4", "zl9Dqx-j7q4"], "start_seconds": ["160", "6"], "properties": ["bird, chirp, train", "engine, laugh, loud"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["water flows as men speak and yell", "three men talk while wind 
blows and some liquid flows"], "sample_ids": ["vJ7JPEFhyLA", "vJ7JPEFhyLA"], "start_seconds": ["16", "16"], "properties": ["water, flow, men", "three men, wind, flow"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows men speaking and yelling?", "label": 0}, {"captions": ["loud, continuous burping", "an insect buzzes around continuously"], "sample_ids": ["y636gklDioE", "v25l1jef3JY"], "start_seconds": ["20", "0"], "properties": ["loud, continuous, burping", "buzzes, continuously, insect"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person burps loudly several times", "a fly is buzzing around a microphone "], "question": "which entity is not a human", "label": 1}, {"captions": ["someone snores nearby", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["spJCm8tD9Zo", "uEU-Hg5MTN8"], "start_seconds": ["90", "27"], "properties": ["someone snores, nearby, someone", "animal, grunts, snorts"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a vehicle accelerates and squeals tires"], "sample_ids": ["xyL9F5VrjkE", "yRx9txMcBl0"], "start_seconds": ["20", "40"], "properties": ["wind, motor, distance", "accelerates, tires, squeals"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "in 10 words or less 
the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a car accelerates and wind blows"], "sample_ids": ["xKB8O8LTs6s", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["music, radio, gunshots", "accelerates, wind, blows"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["water flows as men speak and yell", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vJ7JPEFhyLA", "wqZ135Ssz0"], "start_seconds": ["16", "60"], "properties": ["water, flow, men", "two men, woman, birds"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "people applaud and hoot and chat quietly"], "sample_ids": ["zcDwZ6W7E3E", "wwyfGO2J4"], 
"start_seconds": ["180", "90"], "properties": ["a, man, speak", "people, applaud, hoot"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "water pouring and bubbling"], "sample_ids": ["sfAvvZwdLCY", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["flushes, drains, water", "water, bubbles, pouring"], "captions_pred_video": ["footage of the toilet in the bathroom", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a toilet is flushed", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "water flows and trickles"], "sample_ids": ["vveS8HT7Uog", "tB7hWb9gTuQ"], "start_seconds": ["100", "30"], "properties": ["a man, objects, speak", "water, flow, trickle"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "the rocks on the beach are surrounded by water and the sky 
is visible in the background"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["someone snores nearby", "a person snores loudly multiple times at a close distance"], "sample_ids": ["spJCm8tD9Zo", "sSMl2vc3ek"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "loud, multiple, distance"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a person snoring loudly"], "question": "which entity is more annoying", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a child speaks in closed space"], "sample_ids": ["zFjIWfSD-4", "yW6FWLSLkx4"], "start_seconds": ["410", "40"], "properties": ["People, motor, brakes", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a dog barks and whimpers", "water is sprayed across a hard surface"], "sample_ids": ["sShpyu2l4YQ", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["barks, whimpers, dog", "water, spray, surface"], "captions_pred_video": ["the puppies are playing with a toy", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a dog is barking and growling", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["wy1eKjR7KC0", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], 
"properties": ["people, talk, distance", "sheep, baa, birds"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "a infant makes noise and is excited"], "sample_ids": ["x6ijhqRY38s", "wIJK3-5y0kA"], "start_seconds": ["250", "30"], "properties": ["something metal, glass, hit", "noise, excited, infant"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tiDFTC-5vU", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["male, duck, laugh", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["someone is snoring while sleeping", "three men talk while wind blows and some liquid flows"], "sample_ids": ["ujMt0-D-x2k", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["snore, sleep, someone", "three men, wind, flow"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": 
"which entity is about a person", "label": 0}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a machine beeps continuously"], "sample_ids": ["uzQnlJXBbOM", "y682ml90jGw"], "start_seconds": ["50", "11"], "properties": ["ringing, beep, stop", "beeps, machine, continuously"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a beeping sound is being made "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "paper is crumpling consistently"], "sample_ids": ["yLy-WycbVVE", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["background, people, talk", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "paper is crumpled and crinkled"], "question": "which entity is more silent", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "an infant crying as a woman laughs"], "sample_ids": ["yFB25fqfU8I", "xhmRY9yhC7c"], "start_seconds": ["300", "20"], "properties": ["wave, crash, shoreline", "a, laugh, infant"], "captions_pred_video": ["footage of a person surfing in the ocean", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a woman speaks as she rubs two objects together"], "sample_ids": ["zofjfKhqLk8", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": 
["background, metal, clank", "two objects, woman, speak"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["water splashes and a door squeaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sdXV-ylviw", "su6FAOcOA8c"], "start_seconds": ["190", "4"], "properties": ["sound, splash, door", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a clock ticktocks"], "sample_ids": ["tOSWIURC-4", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["engine, work, nearby", "ticktocks, clock, 
ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a lawn mower is running ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a woman speaks with water running", "small dogs yip and bark sharply"], "sample_ids": ["wTideSjRFS0", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["water, running, woman", "bark, yip, sharply"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["dogs barking and whimpering", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tIY7qOV3rEM", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["barking, whimpering, dog", "a woman, something, fried"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vSeGhaZt-aI", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["water, bubbles, run", "three men, wind, flow"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a man speaking while water bubbles and 
runs?", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "a toilet flushes and a female speaks"], "sample_ids": ["s3cTDAj31g", "yaln9y8I7ms"], "start_seconds": ["80", "230"], "properties": ["man, talk, woman", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a toilet flushes and a man speaks"], "question": "which entity is about a toilet?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "dishes cling together then a man begins to speak"], "sample_ids": ["xBxDz0CFVn0", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["wind, chatter, people", "cling, speak, dishes"], "captions_pred_video": ["footage is blurry and out of focus", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a man speaks as crickets sing"], "sample_ids": ["sa6TLVbooCc", "ryFDPxgDOGc"], "start_seconds": ["240", "570"], "properties": ["people, laugh, child", "a, crickets, sing"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a group of people dressed in camouflage and hunting gear in the dark"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking with crickets chirping in the background"], "question": "which entity has a child speaking?", "label": 0}, {"captions": ["wind blows strongly", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["w8uLijTqtlU", "tDVADusiIoc"], "start_seconds": ["70", "60"], "properties": ["wind, blows, strongly", "water, radio, man"], "captions_pred_video": ["footage is blurry and shaky", "a person riding on the back of a sailboat in rough 
seas"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be in a storm", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "water flows as men speak and yell"], "sample_ids": ["xvDdE3zNf8Y", "vJ7JPEFhyLA"], "start_seconds": ["120", "16"], "properties": ["a, female, speaks", "water, flow, men"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "winds blows roughly as a vehicle races past"], "sample_ids": ["wwyfGO2J4", "xjvTpk2Zpr8"], "start_seconds": ["90", "70"], "properties": ["people, applaud, hoot", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["dogs barking and whimpering", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tIY7qOV3rEM", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "multiple, people, yell"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "paper is crumpling consistently"], "sample_ids": ["sQwlkXjQabo", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": 
["liquid, surface, spray", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["spraying followed by silence", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sLUnaPT5gM8", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "a woman, a television program, a bird"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yYEVLuqEytU", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["animal, pig, background", "men, talk, cars"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["several sheep bleat 
and a man speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "people speak in the background as a clock ticktocks"], "sample_ids": ["tiDFTC-5vU", "vZAw4apG0Es"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "background, clock, ticktocks"], "captions_pred_video": [null, "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a clock is ticking and people are talking"], "question": "which entity has a clock ticktocking in the background?", "label": 1}, {"captions": ["a toilet flushes and water drains", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sfAvvZwdLCY", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["water drains, flushes, water", "music, gunfire, explosion"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a toilet is flushed", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a scene of a toilet flushing and water draining?", "label": 0}, {"captions": ["a motorcycle idles loudly as wind blows", "wind blows as people chatter quietly"], "sample_ids": ["v7jJS8aAyA", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["wind, blows, loudly", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "dishes cling together then a man begins to speak"], "sample_ids": ["wqADXCzngMw", "sQGXqGcwOTc"], "start_seconds": 
["340", "3"], "properties": ["audio, humming, revving", "cling, speak, dishes"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "mechanisms are operating and water is splashing "], "question": "which entity is a video", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wTideSjRFS0", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["food, sizzle, woman", "engine, accelerate, idle"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xzKKf9bKNUo", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["background, noise, snoring", "music, gunfire, explosion"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person snoring loudly", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zkKdxzNC97Y", "vJ7JPEFhyLA"], "start_seconds": ["27", "16"], "properties": ["loud, bang, noise", "three men, wind, flow"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a man in a red shirt 
paddling a kayak in the water"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a loud noise?", "label": 0}, {"captions": ["a man talks as several small engines run", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["u9A6VZQCZpU", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "people speak as gunfire rings out"], "sample_ids": ["uiItxDsDMFI", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["wood, piece, saw", "gunfire, ring, speak"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sQGXqGcwOTc", "wz7N8YRy74I"], "start_seconds": ["3", "30"], "properties": ["audio, kid, giggles", "rooster, crow, background, men"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, 
{"captions": ["a person speaks over rustling leaves", "a horse runs while two women talk"], "sample_ids": ["zOZleIRqZm4", "sdvI1mHAsc"], "start_seconds": ["80", "20"], "properties": ["rustling, leaves, person", "two women, horse, run"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "horses clip-clop and a woman speaks"], "question": "which entity has more action", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "some men converse over an engine running"], "sample_ids": ["xOZfdgAgJ9o", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["woman, whimpering, speaking", "men, converse, engine"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a person sniffles and sneezes", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["uRlbY6aoBU", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["sneezes, sniffles, person", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and a car is revving with 
laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["an adult woman and an adult man speak", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zTLVJCo4WEE", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["two people, adult, speak", "a woman, something, fried"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a frog croaks as other frogs croak in the background"], "sample_ids": ["weDbePuc-Xc", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["music, slaps, human", "background, frog, croak"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a close up of a frog in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wsHBIgzs9Fs", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["horn, continuous, buzzing", "airplane, boy, fly"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a motor 
slows to a stopover traffic noises", "several insects fly while two men talk"], "sample_ids": ["zofjfKhqLk8", "s-T9OVOiMLo"], "start_seconds": ["10", "330"], "properties": ["noise, stop, motor", "several, fly, men"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "rain falls on a surface as men speak and thunder roars"], "sample_ids": ["tDVADusiIoc", "w0xsN8X18Y"], "start_seconds": ["60", "30"], "properties": ["man, radio, blows", "rain, thunder, surface"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is about a man speaking on a radio as wind blows?", "label": 0}, {"captions": ["someone snores nearby", "a man speaks as a car is passing by"], "sample_ids": ["spJCm8tD9Zo", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a, car, pass"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with background noise and breathing 
sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["wRV8yMk886E", "t25U-v4k4ts"], "start_seconds": ["0", "40"], "properties": ["liquid, spray, nozzle", "a, chirps, bird"], "captions_pred_video": ["two cars are parked in a parking lot at night", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a man is speaking and bees are buzzing"], "question": "which entity has a bird chirp?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a man speaks while turning a water faucet on"], "sample_ids": ["yYJksgsxx5U", "vf9xf3vMsGM"], "start_seconds": ["30", "540"], "properties": ["audio, woman, silverware", "A man speaks while turning a water faucet on."], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "of the person washing their hands under the faucet"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is speaking while water is running in the background"], "question": "which entity is a man?", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "rain falls on a surface as men speak and music plays"], "sample_ids": ["w34HjHr6gAY", "w0xsN8X18Y"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "music, surface, rain"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a child speaks in closed space"], "sample_ids": ["ugHJF0hfYkg", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["engine, running, continuously", "child, space, speak"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not running continuously", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zofjfKhqLk8", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["background, metal, clank", "water, radio, man"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a toilet flushes and water drains unevenly"], "sample_ids": ["smGI3C1NZc", "vhJWZheqaE"], "start_seconds": ["30", "0"], "properties": ["water, drain, toilet", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a toilet is flushed"], "question": "which toilet is leaking?", "label": 0}, {"captions": 
["a vehicle engine runs while a woman makes an announcement", "a telephone rings followed by a woman talking"], "sample_ids": ["su6FAOcOA8c", "tGcFnX0GHI"], "start_seconds": ["4", "0"], "properties": ["engine, run, woman", "ring, talk, woman"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "people speak as gunfire rings out"], "sample_ids": ["wsHBIgzs9Fs", "wqTCwqVRDlk"], "start_seconds": ["50", "80"], "properties": ["horn, continuous, buzzing", "gunfire, ring, speak"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a kid speaks followed by music playing", "water pouring and bubbling"], "sample_ids": ["tQWGZLItBXk", "uyRfq-jKPpo"], "start_seconds": ["170", "50"], "properties": ["music, kid, speak", "water, bubbles, pouring"], "captions_pred_video": ["worms revolution screenshots", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "water is running from a faucet"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["material crumbles into a microphone", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vofpvUo6NAw", "zFjIWfSD-4"], "start_seconds": ["220", "410"], "properties": ["material, crumbles, microphone", "People, motor, brakes"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", null], "captions_pred_audio": ["paper is being crumpled and crinkled", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a recording", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a duck quacks continuously"], "sample_ids": ["yLy-WycbVVE", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "quacks, continuously, duck"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a stream of water runs briefly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["x-PeY8Yb8M4", "zj2R0XoFr5k"], "start_seconds": ["300", "50"], "properties": ["stream, water, run", "airplane, boy, fly"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car is 
driving on a wet road ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a propeller rotates loudly and intensely"], "sample_ids": ["ukxt9I7eMMg", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["continuous, woman, speaking", "loud, intense, propeller"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a machine beeps continuously"], "sample_ids": ["wyllXV6PjKo", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["a baby, a woman, a man", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sfAvvZwdLCY", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["flushes, drains, water", "three men, wind, flow"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 0}, {"captions": ["male speech with light ticking", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xO-Q2BlIIPU", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["male, speech, ticking", "a woman, laughs, animal"], "captions_pred_video": ["a 
clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "a child speaks in closed space"], "sample_ids": ["zcDwZ6W7E3E", "yW6FWLSLkx4"], "start_seconds": ["180", "40"], "properties": ["a, man, speak", "child, space, speak"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 0}, {"captions": ["a vehicle engine runs as a siren and horn sound", "people cheer as a vehicle engine revs"], "sample_ids": ["u--KhUW8l1Y", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["sound, vehicle, horn", "engine revs, vehicle, people"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a truck is revving its engine and a man is speaking "], "question": "which vehicle is being driven by people?", "label": 1}, {"captions": ["an engine runs loudly", "a man speaks followed by another man speaking outside"], "sample_ids": ["vqZuVbG6-HI", "viuTg1M-dqg"], "start_seconds": ["130", "30"], "properties": ["loud, engine, run", "two men, speak, follow"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a lawn mower is 
running and men are speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which is quieter", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["w2bYrCVLT60", "vfYTJq7nU"], "start_seconds": ["120", "130"], "properties": ["ducks, speak, quack", "rustling, ducks, quack"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a duck quacks and a woman speaks"], "question": "which entity has more ducks", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a frog vocalizes while birds chirp"], "sample_ids": ["uKCSGgof8gI", "vMf1dLD6Sng"], "start_seconds": ["12", "6"], "properties": ["chirps, distance, signal", "frog, bird, vocalize"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "a frog in a pond with pink flowers in the background"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a frog croaks loudly"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a person snoring several times", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["spJCm8tD9Zo", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["snore, person, several", "airplane, boy, fly"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person is snoring loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a woman speaks happily and an animal chirps"], "sample_ids": ["ugHJF0hfYkg", 
"uWAAAL4CIoc"], "start_seconds": ["10", "0"], "properties": ["loud, propeller, move", "a woman, chirps, animal"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a dog is barking "], "question": "which is quieter", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "a clock ticktocks"], "sample_ids": ["sYITalLZjj4", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["stream, flow, wind", "ticktocks, clock, ticktocks"], "captions_pred_video": ["two ducks are swimming in the water near each other", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["wind blows and birds chirp", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wnpJndXuxLc", "y2bVZ7rz-5M"], "start_seconds": ["50", "280"], "properties": ["blows, vehicle, train", "motor noise, horn, siren"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a loud engine muffles a man as he speaks", "food is frying then a woman speaks"], "sample_ids": ["xyx6eNVEYRY", "ukxt9I7eMMg"], "start_seconds": ["380", "30"], "properties": ["loud, engine, muffles", "food, woman, speak"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a woman is speaking 
while food is frying in the background "], "question": "which entity is a man speaking?", "label": 0}, {"captions": ["dogs barking and whimpering", "a duck quacks continuously"], "sample_ids": ["tIY7qOV3rEM", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "quacks, continuously, duck"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a man speaks as a car is passing by"], "sample_ids": ["zTLVJCo4WEE", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "a, car, pass"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking with background noise and breathing sounds "], "question": "which entity is in a city", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "paper is crumpling consistently"], "sample_ids": ["vZAw4apG0Es", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["people, clock, converse", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a clock is ticking and people are 
talking", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["someone snores nearby", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["spJCm8tD9Zo", "uEU-Hg5MTN8"], "start_seconds": ["90", "27"], "properties": ["someone snores, nearby, someone", "a woman, laughs, animal"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a large crowd cheers and applauds", "people cheer as a vehicle engine revs"], "sample_ids": ["rqfQRErjfk8", "xjhAnI2q6hM"], "start_seconds": ["170", "6"], "properties": ["crowd, cheers, applauds", "engine revs, vehicle, people"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a truck is revving its engine and a man is speaking "], "question": "which entity is a crowd?", "label": 0}, {"captions": ["an engine revs and a turning noise is made", "water splashes as an animal walks through"], "sample_ids": ["tOSWIURC-4", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["noise, engine, revs", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a lawn mower is running ", "water splashes and gurgles as people speak"], "question": "which entity is not a noise", "label": 1}, {"captions": ["birds chirp as a bell rings", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["ziUT9IFTkjg", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["chirp, bell, ring", 
"rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a duck quacks and a woman speaks"], "question": "which entity is about birds?", "label": 0}, {"captions": ["a man rubs two objects together then speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["vveS8HT7Uog", "vzxHnu-SFEw"], "start_seconds": ["100", "80"], "properties": ["a man, objects, speak", "two objects, woman, speak"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a man speaking?", "label": 0}, {"captions": ["an audience gives applause as a man yells and a group sings", "pigeons vocalize and birds chirp"], "sample_ids": ["tdWhHV3X25Q", "uiS58TNyUiw"], "start_seconds": ["60", "430"], "properties": ["applause, audience, yells", "vocalize, bird, chirp"], "captions_pred_video": 
["a man is talking to another man on a stage in front of a microphone", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a man is filing a hard object", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vveS8HT7Uog", "su6FAOcOA8c"], "start_seconds": ["100", "4"], "properties": ["a man, hard, object", "engine, idle, woman"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a woman is speaking and a subway train is moving "], "question": "which object is being filed", "label": 0}, {"captions": ["frogs croak and vocalize", "paper folding and crinkling"], "sample_ids": ["yswmmRZFItk", "zPpG3RD8lSs"], "start_seconds": ["0", "20"], "properties": ["croak, vocalize, frog", "paper, fold, crinkle"], "captions_pred_video": ["a close up of a frog in the water", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's"], "captions_pred_audio": ["a frog is croaking", "the wind blows and a mouse clicks "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["w2bYrCVLT60", "wwyfGO2J4"], "start_seconds": ["120", "90"], "properties": ["ducks, speak, quack", "people, applaud, hoot"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a dog barks and whimpers"], "sample_ids": ["sa6TLVbooCc", "sShpyu2l4YQ"], "start_seconds": ["240", "0"], "properties": ["people, laugh, child", "barks, whimpers, dog"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "the puppies are playing with a toy"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a dog is barking and growling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["w0xsN8X18Y", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["rain, thunder, surface", "a train, a horn, a bell"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a duck quacks continuously"], "sample_ids": ["wRBHTgrbiwg", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["bird, owl, speak", "quacks, 
continuously, duck"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a machine beeps continuously"], "sample_ids": ["ugHJF0hfYkg", "y682ml90jGw"], "start_seconds": ["10", "11"], "properties": ["engine, running, continuously", "beeps, machine, continuously"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a beeping sound is being made "], "question": "which machine beeps continuously", "label": 1}, {"captions": ["wind blows as people chatter quietly", "wind blows as people chatter quietly"], "sample_ids": ["xBxDz0CFVn0", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["wind, chatter, people", "wind, chatter, people"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is a photograph", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "vehicles pass by on a roadway"], "sample_ids": ["ylpYOorfH4o", "tgbONvsP47Y"], "start_seconds": ["410", "0"], "properties": ["engine, run, loud", "pass, vehicle, roadway"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to 
replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a woman speaks happily and an animal chirps"], "sample_ids": ["w5W5Kqtc8E", "uWAAAL4CIoc"], "start_seconds": ["100", "0"], "properties": ["wind, engine, scream", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "people cheer as a vehicle engine revs"], "sample_ids": ["ylpYOorfH4o", "xjhAnI2q6hM"], "start_seconds": ["410", "6"], "properties": ["motor, run, steady", "engine revs, vehicle, people"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a truck is 
revving its engine and a man is speaking "], "question": "which motor runs steadily as a man speaks", "label": 0}, {"captions": ["a horn blasts loudly as a train passes", "a infant makes noise and is excited"], "sample_ids": ["zsLxS-uLJTw", "wIJK3-5y0kA"], "start_seconds": ["20", "30"], "properties": ["horn, blast, train", "noise, excited, infant"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a baby cries and a woman speaks"], "question": "which is louder", "label": 0}, {"captions": ["a goat screams and people speak in the background", "a door slams shut roughly"], "sample_ids": ["xC8kbrKJmco", "zkKdxzNC97Y"], "start_seconds": ["0", "27"], "properties": ["background, goat, scream", "a door, slams, shut"], "captions_pred_video": [null, "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a goat is bleating ", "a door is opened and closed"], "question": "which entity is quieter", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a toilet flushes and water drains"], "sample_ids": ["y2bVZ7rz-5M", "sfAvvZwdLCY"], "start_seconds": ["280", "20"], "properties": ["motor noise, horn, siren", "water drains, flushes, water"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "paper is crumpling consistently"], "sample_ids": ["vms5XGTDVQc", "v5cSxLaHADY"], "start_seconds": ["220", "0"], "properties": ["paper, crumpled, crinkled", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a woman opening a black 
bag on a table", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["paper is crumpled and crinkled", "paper is crumpled and crinkled"], "question": "which paper is crumpling consistently", "label": 1}, {"captions": ["a motorcycle engine works nearby", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tOSWIURC-4", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["engine, work, nearby", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a duck quacks and a woman speaks"], "question": "which entity is a natural event", "label": 1}, {"captions": ["vehicles pass by on a roadway", "pigeons vocalize and birds chirp"], "sample_ids": ["tgbONvsP47Y", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["pass, vehicle, roadway", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a fire truck entering a garage", "of the pigeon in the cage"], "captions_pred_audio": ["a car is driving on the road ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uWPRNLnpy7Y", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["accelerate, laugh, vehicle", "applause, audience, yells"], "captions_pred_video": ["is taken from a car driving down the street", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["wqZ135Ssz0", "uEU-Hg5MTN8"], 
"start_seconds": ["60", "27"], "properties": ["man, woman, squawks", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a more snorts", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["v5P-ThUCINM", "vfYTJq7nU"], "start_seconds": ["400", "130"], "properties": ["background, chirp, bird", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and birds are chirping", "a duck quacks and a woman speaks"], "question": "which entity has more birds", "label": 1}, {"captions": ["a door opens and birds chirp", "wind blows as people chatter quietly"], "sample_ids": ["yeFvk9x0wWI", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "wind, chatter, people"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage is blurry and out of focus"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "men speak and a nozzle sprays liquid"], "sample_ids": ["u21-Z5gJCB8", "wRV8yMk886E"], "start_seconds": ["30", "0"], "properties": ["background, voice, man", "liquid, spray, nozzle"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man speaks followed by a loud burst"], "question": "which 
entity is more active", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "water splashes as an animal walks through"], "sample_ids": ["wwyfGO2J4", "w1ir-sZ3Im8"], "start_seconds": ["90", "90"], "properties": ["people, applaud, hoot", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["u21-Z5gJCB8", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "male, duck, laugh"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a child babbles as a woman speaks"], "sample_ids": ["vJvryTwuAV8", "wEBlkGWVWwE"], "start_seconds": ["16", "260"], "properties": ["audience, cheer, man", "a, babble, woman"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "shows a person writing on the whiteboard"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a woman is speaking and a child is speaking with background noise and clapping "], "question": "which entity is a child?", "label": 1}, {"captions": ["food is frying and sizzles", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zNRChLjqcU", "yajyRTUQk3U"], "start_seconds": ["220", "400"], "properties": ["food is frying, sizzles, food", "a woman, 
something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking while something is fried?", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sZPuqDgX2V0", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["engine, accelerate, intercom", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a beep occurs briefly", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["xtWeJ56-U-g", "w34HjHr6gAY"], "start_seconds": ["20", "30"], "properties": ["beep, occur, briefly", "beeps, hit, woman"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], 
"captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a infant makes noise and is excited"], "sample_ids": ["su6FAOcOA8c", "wIJK3-5y0kA"], "start_seconds": ["4", "30"], "properties": ["engine, run, woman", "noise, excited, infant"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["u--KhUW8l1Y", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["horn, siren, life", "loud, laughter, intermittent"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a goat screams and people speak in the background", "people applaud and hoot and chat quietly"], "sample_ids": ["xC8kbrKJmco", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["background, goat, scream", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a vehicle engine runs and someone speaks"], "sample_ids": ["yLy-WycbVVE", "zF8yoL0rkbI"], "start_seconds": ["30", "30"], 
"properties": ["background, people, talk", "engine, run, someone"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "footage of the traffic on the street at night"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "the wind is blowing hard and water is splashing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sfAvvZwdLCY", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "engine, accelerate, idle"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a toilet is flushed", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a man speaks as a car is passing by"], "sample_ids": ["slZLHwNbbt4", "sK4u5T8hW78"], "start_seconds": ["300", "30"], "properties": ["clap, distance, horn", "a, car, pass"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a propeller moves loudly nearby", "pigeons vocalize 
and birds chirp"], "sample_ids": ["ugHJF0hfYkg", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["loud, propeller, move", "vocalize, bird, chirp"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of the pigeon in the cage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and a bee is buzzing"], "question": "which is quieter", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["x9JovgqUcs", "vqZuVbG6-HI"], "start_seconds": ["500", "130"], "properties": ["a, man, speaks, keyboard", "background, male, female"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a lawn mower is running and men are speaking "], "question": "which entity has a man speaking and typing on a keyboard?", "label": 0}, {"captions": ["a clock ticktocks continuously", "a child speaks in closed space"], "sample_ids": ["vlJS7LN2XyM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["ticktocks, clock, ticktocks continuously", "child, space, speak"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "water flows and trickles"], "sample_ids": ["zuua6-5goWw", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "water, flow, trickle"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a person snoring several times", "a pigeon cooing as an insect buzzes by briefly"], "sample_ids": ["spJCm8tD9Zo", "yZrFNS7GFBQ"], "start_seconds": ["90", "30"], "properties": ["snore, person, several", "pigeon, buzzes, insect"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the bird in the cage"], "captions_pred_audio": ["a person is snoring loudly", "an owl hoots in the background "], "question": "which entity is not a person?", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["s4Uz1Ffgo04", "sapQIQUhFc"], "start_seconds": ["100", "280"], "properties": ["water, rushes, motorcycle", "liquid, flow, distance"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["tapping occurs then a baby cries", "an adult speaks and is 
typing on a computer keyboard"], "sample_ids": ["wIJK3-5y0kA", "x9JovgqUcs"], "start_seconds": ["30", "500"], "properties": ["a, cry, baby", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks and types on a keyboard"], "question": "which entity is typing?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["uWAAAL4CIoc", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["a woman, chirps, animal", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wTjoRj1se3U", "xBxDz0CFVn0"], "start_seconds": ["390", "30"], "properties": ["airplane, engine, spool", "stream, water, flow"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage is blurry and out of focus"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["birds tweet and squawk", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["w1mlz3Pe4fU", "tDlysoZiA1I"], "start_seconds": ["300", "0"], "properties": ["squawk, tweet, scream", "animal, grunts, chirps"], "captions_pred_video": ["of a bird in a cage", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping and singing", "birds are chirping and a rooster is crowing "], "question": "which 
entity is more quiet", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sNB8zxXneIM", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["several, quack, cocks", "a woman, something, fried"], "captions_pred_video": ["a group of geese in a cage", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a man speaks on a radio as wind blows"], "sample_ids": ["tgbONvsP47Y", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["pass, vehicle, roadway", "man, radio, blows"], "captions_pred_video": ["footage of a fire truck entering a garage", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a car is driving on the road ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is stationary", "label": 1}, {"captions": ["a man speaks as horns blow", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tHyNqRyK34A", "uYT5gxnyMWM"], "start_seconds": ["24", "50"], "properties": ["a, man, speaks", "female, spraying, scream"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is a woman?", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "people speak and tapping occurs"], "sample_ids": ["vXlk0lIQBFo", "tFCUUGdREgA"], "start_seconds": ["470", "70"], "properties": ["wind, talk, vocalize", "people, tap, speak"], "captions_pred_video": ["- a woman and 
two donkeys in a fenced in area", "a person riding a white horse in an indoor arena"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking and walking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "someone whistles a tune"], "sample_ids": ["vVhthZ45k3Y", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["cat, purr, hiss", "someone, tune, whistle"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a person whistling a song"], "question": "which entity is a human activity", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a car accelerates and wind blows"], "sample_ids": ["zliInBdC98Y", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["a, baby, cries, wails", "accelerates, wind, blows"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an engine runs loudly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vqZuVbG6-HI", "su6FAOcOA8c"], "start_seconds": ["130", "4"], "properties": ["loud, engine, run", "engine, idle, woman"], "captions_pred_video": ["footage is blurry because it's raining outside", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity is quieter", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wfHeoPDLMaM", "wz7N8YRy74I"], 
"start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "rooster, crow, background, men"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["ducks are quacking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a clock ticktocks"], "sample_ids": ["yYEVLuqEytU", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "waves crash against a shoreline and people speak"], "sample_ids": ["s6DESzUTGjY", "yFB25fqfU8I"], "start_seconds": ["16", "300"], "properties": ["wind, laugh, woman", 
"wave, crash, shoreline"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["a man talks as several small engines run", "a rumbling clap in the distance followed by a horn and the rumbling grows louder"], "sample_ids": ["u9A6VZQCZpU", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["a, man, talk", "clap, distance, horn"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is not a clap?", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "a piece of wood is being placed down and sawed"], "sample_ids": ["vbpKkWvfOu4", "uiItxDsDMFI"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "wood, piece, saw"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a saw is being used with background noise "], "question": "which entity is about a piece of wood being sawed?", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a toilet flushes and water sputters as it drains"], "sample_ids": ["tw76HGONaKg", "smGI3C1NZc"], "start_seconds": ["570", "30"], "properties": ["music, click, man", "water, drain, toilet"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a toilet is flushed"], "question": "which entity is silent", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uYT5gxnyMWM", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "a 
woman, a television program, a bird"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a man talks followed by a woman shouting"], "sample_ids": ["yeFvk9x0wWI", "s3cTDAj31g"], "start_seconds": ["30", "80"], "properties": ["clack, bird, chirp", "man, talk, woman"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "running water in a faucet with some clinks"], "sample_ids": ["su6FAOcOA8c", "zNRChLjqcU"], "start_seconds": ["4", "220"], "properties": ["engine, idle, woman", "water, faucet, run"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "water is running from a faucet into a sink"], "question": "which entity is a source of water", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a train horn blows as it passes by"], "sample_ids": ["v-wcQf4BDY0", "zVacuqSb4LI"], "start_seconds": ["120", "30"], "properties": ["bark, yip, sharply", "horn, blows, train"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a dog barks and growls", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["zuua6-5goWw", "sapQIQUhFc"], "start_seconds": ["30", "280"], "properties": ["sound, pop, bird", "liquid, flow, distance"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking and a stream is flowing in the background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["motors rev and run loudly as a person laughs", "a man speaks uses a drill"], "sample_ids": ["zl9Dqx-j7q4", "x5eIC7S0fbg"], "start_seconds": ["6", "60"], "properties": ["motors rev, laugh, loudly", "A man is speaking, uses 
a drill, and is a tool"], "captions_pred_video": ["footage of a man driving a car in the dark", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["food is frying and sizzles", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["zNRChLjqcU", "xV7Mg1QucSc"], "start_seconds": ["220", "14"], "properties": ["food is frying, sizzles, food", "alarm, ticktocks, laughs"], "captions_pred_video": [null, "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["water is running from a faucet into a sink", "an alarm clock ticks and a woman laughs"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "pigeons vocalize and birds chirp"], "sample_ids": ["vlJS7LN2XyM", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["background, clocks, ticking", "vocalize, bird, chirp"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "of the pigeon in the cage"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "water pouring and bubbling"], "sample_ids": ["sZPuqDgX2V0", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["commentator, race, track", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro 
puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine runs and a man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["yT5WfYMRr-U", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "background, motor, run"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a man speaking while an engine runs?", "label": 0}, {"captions": ["a male speaks and another male speaks", "paper folding and crinkling"], "sample_ids": ["viuTg1M-dqg", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["two males, speaking, male", "paper, fold, crinkle"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling paper?", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "a infant makes noise and is excited"], "sample_ids": ["vMf1dLD6Sng", "wIJK3-5y0kA"], "start_seconds": ["6", "30"], "properties": ["frog, bird, vocalize", "noise, excited, infant"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a frog croaks loudly", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["male speech with light ticking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xO-Q2BlIIPU", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["male, speech, ticking", "multiple, people, yell"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "dishes cling together then a man begins to speak"], "sample_ids": ["sYITalLZjj4", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["water, rushes, background, birds", "cling, speak, dishes"], "captions_pred_video": ["two ducks are swimming 
in the water near each other", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["wind blows and birds chirp", "mechanisms are operating and water is splashing "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a car revs and accelerates loudly and men and women chatter among themselves"], "sample_ids": ["yZrFNS7GFBQ", "y8dSeubCNI"], "start_seconds": ["30", "4"], "properties": ["pigeon, buzzes, insect", "men, women, car"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "an engine revving and people talking in the background"], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "continuous sneezing together with speech"], "sample_ids": ["xzKKf9bKNUo", "x4dZyf9Gbj0"], "start_seconds": ["10", "130"], "properties": ["background, noise, snoring", "continuous, sneeze, speech"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage is blurry and out of focus"], "captions_pred_audio": ["a person snoring loudly", "a woman sneezes and speaks"], "question": "which entity is more like a sneeze", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a woman and man are speaking"], "sample_ids": ["tK4VlLsNxak", "vbpKkWvfOu4"], "start_seconds": ["120", "560"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "two people, speaking, woman, man"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man 
is speaking and using a sewing machine", "a woman is speaking and a man is speaking"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a man speaks as a car is passing by"], "sample_ids": ["sSMl2vc3ek", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["a person, laughs, snores", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["spYNpeN7rPY", "wDVMhEdTiVw"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "gun, shoot, water"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["birds chirp and objects are moved around", "water flows and trickles"], "sample_ids": ["yPUYU6t3rwo", "tB7hWb9gTuQ"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "water, flow, trickle"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["insects buzz and a man speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], 
"sample_ids": ["weDbePuc-Xc", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["cartoon character, music, vocalize", "clickety-clack, train, whistle"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["children speak as a female ask them questions", "children cheer as a man speaks then an audience screams"], "sample_ids": ["wEBlkGWVWwE", "vJvryTwuAV8"], "start_seconds": ["260", "16"], "properties": ["female, speak, questions", "audience, cheer, man"], "captions_pred_video": ["shows a person writing on the whiteboard", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is a performance", "label": 1}, {"captions": ["women speak and laugh as wind blows", "people applaud and hoot and chat quietly"], "sample_ids": ["un9VQlzgZM", "wwyfGO2J4"], "start_seconds": ["5", "90"], "properties": ["wind, speak, laugh", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "some men converse over an engine running"], "sample_ids": ["sjlVMgdGSK0", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["accelerates, vehicle, race car", "men, converse, engine"], 
"captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is a still image", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["v5P-ThUCINM", "vlS6YMeWAPo"], "start_seconds": ["400", "40"], "properties": ["background, chirp, bird", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a goat bleats and birds chirp"], "question": "which entity has more birds", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "birds chirp as a train approaches"], "sample_ids": ["tMJne1a4AFI", "xM4joTqDVp4"], "start_seconds": ["0", "160"], "properties": ["wind, buzz, rustling", "bird, chirp, train"], "captions_pred_video": ["a swarm of bees on the ground", "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["a swarm of bees buzzing around", "birds are chirping and a train is moving "], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "paper folding and crinkling"], "sample_ids": ["siJFXfGWgDk", "zPpG3RD8lSs"], "start_seconds": ["50", "20"], "properties": ["a, bird, vehicle", "paper, fold, crinkle"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a car speeding up in the distance"], "sample_ids": ["sAam2NqGhLY", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["snoring, breathing, child", "distance, car, speed"], "captions_pred_video": ["of a little girl sleeping on a couch", null], "captions_pred_audio": ["a person is snoring", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "water splashes as an animal walks through"], "sample_ids": ["uRExseg-0XI", "w1ir-sZ3Im8"], "start_seconds": ["210", "90"], "properties": ["woman, man, water", "animal, water, splashes"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "water splashes and gurgles as people speak"], "question": "which entity is about water", "label": 1}, {"captions": ["a woman and man speak while food is frying", "someone is typing on a computer keyboard"], "sample_ids": ["zk-xJGQU8-4", 
"v0x1odnXtP0"], "start_seconds": ["130", "210"], "properties": ["food, man, woman", "keyboard, type, computer"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a person is typing on a keyboard"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "someone is typing on a computer keyboard"], "sample_ids": ["wIvYjuR3nrg", "v0x1odnXtP0"], "start_seconds": ["9", "210"], "properties": ["birds, pigeons, vocalize", "keyboard, type, computer"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "how to make money on youtube in spanish"], "captions_pred_audio": ["birds are chirping and cooing", "a person is typing on a keyboard"], "question": "which entity is typing", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["sWZzXuWYY", "y2bVZ7rz-5M"], "start_seconds": ["420", "280"], "properties": ["male, speech, banging", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "some tunes played by whistling"], "sample_ids": ["vbpKkWvfOu4", "u6BnG6YZqJ4"], "start_seconds": ["560", "0"], "properties": ["a, man, speaks", "tune, play, whistling"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a person snores loudly multiple times at a close distance"], "sample_ids": ["u5RmF3c3Aw", "sSMl2vc3ek"], "start_seconds": ["60", "20"], "properties": ["engine, car, zoom", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vBslzh7saPw", "xfaoyyzw2WU"], "start_seconds": ["90", "180"], "properties": ["power, scream, increase", "loud, jet engine, roar"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a jet engine roars and accelerates ", "an aircraft engine roars and a man speaks "], "question": "which jet engine is louder", "label": 1}, {"captions": ["a man woman speak while crickets sing", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["zTLVJCo4WEE", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["a, crickets, sing", "sound, chirp, buzz"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a bee on a purple thistle flower"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a bee buzzes and a woman speaks"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["some men converse over an engine running", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sCiy7QS1U", "uEU-Hg5MTN8"], "start_seconds": ["300", 
"27"], "properties": ["men, converse, engine", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["a motorcycle engine is idling", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vZAqdHZ81yA", "tDVADusiIoc"], "start_seconds": ["180", "60"], "properties": ["engine, motorcycle, idling", "water, radio, man"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a train horn blows as it passes by"], "sample_ids": ["vZAqdHZ81yA", "zVacuqSb4LI"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "horn, blows, train"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a 
video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["an engine is idling loudly", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a clock ticktocks in wind", "an aircraft engine runs as wind blows heavily"], "sample_ids": ["yVumC9TGknc", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["ticktocks, clock, wind", "engine, run, wind"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a series of beeps and chirps", "a jet engine roars and wind blows "], "question": "which object is moving in the wind", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a child speaks in closed space"], "sample_ids": ["tOSWIURC-4", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["engine, work, nearby", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a lawn mower is running ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "pigeons vocalize and birds chirp"], "sample_ids": ["zl9Dqx-j7q4", "uiS58TNyUiw"], "start_seconds": ["6", "430"], "properties": ["motors rev, laugh, loudly", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a man driving a car in the dark", "of the pigeon in the cage"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vBslzh7saPw", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": 
["engine, spools, takes", "airplane, boy, fly"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xSKJGCItUWE", "su6FAOcOA8c"], "start_seconds": ["10", "4"], "properties": ["engine, run, boy", "engine, idle, woman"], "captions_pred_video": ["footage of the helicopter flying in the room", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking and a subway train is moving "], "question": "which entity has a running engine", "label": 0}, {"captions": ["a man speaks as horns blow", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tHyNqRyK34A", "zFjIWfSD-4"], "start_seconds": ["24", "410"], "properties": ["a, man, speaks", "People, motor, brakes"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "an adult man speaks over glass clinking"], "sample_ids": ["yaln9y8I7ms", "u6jIvCtKarQ"], "start_seconds": ["230", "70"], "properties": ["female, flushes, toilet", "a, man, speaks"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person using a blender on a stove top"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking and dishes are being moved 
with background noise "], "question": "which entity is a man speaking over glass clinking?", "label": 1}, {"captions": ["a person screams glaringly", "a man speaks as a car is passing by"], "sample_ids": ["xC8kbrKJmco", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["glaringly, screams, person", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more passive", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xKB8O8LTs6s", "sSMl2vc3ek"], "start_seconds": ["70", "20"], "properties": ["music, radio, gunshots", "loud, multiple, distance"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vmrxwuAMb2I", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["a dog, inhales, exhales", "rustling, ducks, quack"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", null], "captions_pred_audio": ["a dog barks and growls", "a 
duck quacks and a woman speaks"], "question": "which entity is about a dog?", "label": 0}, {"captions": ["a man speaks as a car is passing by", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["sK4u5T8hW78", "yks4cLgIDMc"], "start_seconds": ["30", "170"], "properties": ["a, car, pass", "background, speaking, child"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zk-xJGQU8-4", "uYT5gxnyMWM"], "start_seconds": ["130", "50"], "properties": ["food, man, woman", "female, spraying, scream"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman and man speaking?", "label": 0}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "an engine runs loudly"], "sample_ids": ["w6RTHR6AeAg", "vqZuVbG6-HI"], "start_seconds": ["40", "130"], "properties": ["call, owl, screech", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's 
raining outside"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a person uses a saw to cut some wood"], "sample_ids": ["ylpYOorfH4o", "sHbXC6na9hg"], "start_seconds": ["410", "0"], "properties": ["engine, running, wind", "a person, saw, wood"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a man is speaking and an engine is revving", "an engine is idling and vibrating"], "question": "which entity is stationary", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "vehicles pass by on a roadway"], "sample_ids": ["zofjfKhqLk8", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["noise, stop, motor", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "water rushes and then a vehicle zooms past"], "sample_ids": ["ukxt9I7eMMg", "s4Uz1Ffgo04"], 
"start_seconds": ["30", "100"], "properties": ["food, pan, cook", "water, rushes, vehicle"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is moving", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "birds chirp and objects are moved around"], "sample_ids": ["ukg5L09Wpvo", "yPUYU6t3rwo"], "start_seconds": ["150", "370"], "properties": ["clickety-clack, train, whistle", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["children cry and people talk", "an insect buzzes around continuously"], "sample_ids": ["xLwHe825Zs", "v25l1jef3JY"], "start_seconds": ["18", "0"], "properties": ["people talk, children cry, people talk", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a baby cries and a woman speaks", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a frog croaks as other frogs croak in the background"], "sample_ids": ["xfudFO976zE", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["animal, bleats, cry", "background, frog, croak"], "captions_pred_video": ["footage is blurry and shaky", "a close up of a frog in the water"], "captions_pred_audio": ["a goat 
bleats and birds chirp in the background ", "a frog is croaking"], "question": "which animal is more likely to be a frog?", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y2ZBGpgbhHM", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["dog, chirp, breathe", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds chirping and a dog panting", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "wind blowing followed by a zoom"], "sample_ids": ["yNtRmrn0io8", "vr8ZXjEBhMQ"], "start_seconds": ["210", "150"], "properties": ["storm, distance, strike", "wind, blow, zoom"], "captions_pred_video": ["footage of a house in the middle of the night", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["rain falls and thunder roars", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to cause damage", "label": 0}, {"captions": ["a man speaks as horns blow", "a car accelerates and wind blows"], "sample_ids": ["tHyNqRyK34A", "u0TrcHhkPQ"], "start_seconds": ["24", "20"], "properties": ["a, man, speaks", "accelerates, wind, blows"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tiDFTC-5vU", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["male, duck, laugh", "applause, 
audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sfAvvZwdLCY", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["flushes, drains, water", "music, gunfire, explosion"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a toilet is flushed", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a clock ticktocks"], "sample_ids": ["spYNpeN7rPY", "v-g-j2uTByM"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "ticktocks, clock, ticktocks"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a clock is ticking loudly"], "question": "which entity has a clock ticktocks?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a duck quacks loudly and continuously"], "sample_ids": ["vZAw4apG0Es", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "loud, continuous, quacks"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a clock is ticking and people are talking", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "an engine runs loudly"], "sample_ids": ["uiItxDsDMFI", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["wood, piece, saw", "loud, engine, run"], "captions_pred_video": ["a man 
cutting a log with an axe in the woods", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a saw is being used with background noise ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["se87d6yxEOA", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a moving object", "label": 0}, {"captions": ["a man speaks as he moves silverware in a bowl", "small dogs yip and bark sharply"], "sample_ids": ["x6ijhqRY38s", "v-wcQf4BDY0"], "start_seconds": ["250", "120"], "properties": ["bowl, silverware, man", "bark, yip, sharply"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vzxHnu-SFEw", "zl9Dqx-j7q4"], "start_seconds": ["80", "6"], "properties": ["two objects, woman, speak", "engine, laugh, loud"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "an small aircraft engine runs and a boy speaks"], "sample_ids": ["yJ0TePmaOo", "xSKJGCItUWE"], "start_seconds": ["390", "10"], "properties": ["two hard objects, man, speak", "engine, run, boy"], "captions_pred_video": [null, "footage of the helicopter flying in the room"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a high pitched engine is running and a child speaks"], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "a woman speaks and other women and a man talk with her"], "sample_ids": ["w2JXXIAdUdg", "vbpKkWvfOu4"], "start_seconds": ["10", "560"], "properties": ["emits, sleeping, person", "a, woman, man"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a stream of water flows quickly", "winds blows roughly as a vehicle races past"], "sample_ids": ["wbHTKEJZyhc", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["stream, water, flow", "wind, blows, vehicle"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a jet engine roars and wind blows "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zCrAfDfv6-A", "uYT5gxnyMWM"], "start_seconds": ["30", 
"50"], "properties": ["person, mouse, click", "a, scream, girl"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking and a baby is crying"], "question": "which entity is a person?", "label": 0}, {"captions": ["a sleeping person emits a gravely snore", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["w2JXXIAdUdg", "uZesmtKZGSw"], "start_seconds": ["10", "250"], "properties": ["emits, sleeping, person", "men, talk, cars"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a woman talking as an infant is crying"], "sample_ids": ["xKB8O8LTs6s", "tMbMDvT50j8"], "start_seconds": ["70", "12"], "properties": ["music, gunfire, explosion", "a, talk, infant"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a 
baby cries and a woman speaks"], "question": "which entity is more calm", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "water flows as men speak and yell"], "sample_ids": ["ukxt9I7eMMg", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["food, pan, cook", "water, flow, men"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a man laughs and speaks as cats purr and hiss", "small dogs yip and bark sharply"], "sample_ids": ["vVhthZ45k3Y", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["cat, purr, hiss", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a dog barks and growls"], "question": "which animal is more playful", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zcDwZ6W7E3E", "vYkA3cfXp5Q"], "start_seconds": ["180", "30"], "properties": ["man, speak, motorcycles", "engine, accelerate, idle"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a drill runs and two people laugh", "a man speaks as a car is passing by"], "sample_ids": ["tEE3MpBt1sg", "sK4u5T8hW78"], "start_seconds": ["50", "30"], 
"properties": ["two people, laugh, drill", "a, car, pass"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to a car passing by?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "an infant crying frantically"], "sample_ids": ["uZesmtKZGSw", "zwOBqeFTgiU"], "start_seconds": ["250", "30"], "properties": ["men, talk, cars", "cry, infant, frantically"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male 
speaking and duck calls being blown"], "sample_ids": ["xl2PIWyXaM", "vfYTJq7nU"], "start_seconds": ["160", "130"], "properties": ["chirp, man, younger person", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people are talking", "a duck quacks and a woman speaks"], "question": "which entity has more animals", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["x5cuQjOdM3E", "rqu8iB22IY"], "start_seconds": ["30", "5"], "properties": ["cat, meows, young woman", "sound, repeats, laugh"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a dog barks and a man speaks while music plays "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a large bell chimes back and forth loudly", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["w2M4i1mklOA", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["loud, chime, bell", "a, scream, girl"], "captions_pred_video": ["footage of an antique clock", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking and a baby is crying"], "question": "which entity is quieter", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a man speaks as a car is passing by"], "sample_ids": ["yeFvk9x0wWI", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["chirp, twitter, clatter", "a, car, pass"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is silent", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "someone whistles a tune"], "sample_ids": ["wztCSUxOf8", "sIXTftIuUgw"], "start_seconds": ["130", "90"], "properties": ["a crowd, yells, applauds", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a person whistling a song"], "question": "which entity is more quiet", "label": 1}, {"captions": ["vehicles pass by on a roadway", "birds chirp and objects are moved around"], "sample_ids": ["tgbONvsP47Y", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["pass, vehicle, roadway", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a car is driving on the road ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water drains", "a person is burping then speaks and laughs"], "sample_ids": ["sfAvvZwdLCY", "wAAkbZToh8"], "start_seconds": ["20", "0"], "properties": ["water drains, flushes, water", "burp, laugh, speak"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a man burps and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a child speaks in closed space", "wind blowing followed by a zoom"], "sample_ids": ["yW6FWLSLkx4", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["child, space, speak", "wind, blow, zoom"], 
"captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to blow", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "someone is typing on a computer keyboard"], "sample_ids": ["tw76HGONaKg", "v0x1odnXtP0"], "start_seconds": ["570", "210"], "properties": ["A, game, keyboard", "keyboard, type, computer"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a person is typing on a keyboard"], "question": "which keyboard is used to type on a computer", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "water splashes and a door squeaks"], "sample_ids": ["sapQIQUhFc", "sdXV-ylviw"], "start_seconds": ["280", "190"], "properties": ["water, trickles, flow", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is 
speaking and a stream is flowing in the background ", "a dog barks and taps with background noise "], "question": "which entity has a door that squeaks?", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "winds blows roughly as a vehicle races past"], "sample_ids": ["wqADXCzngMw", "xjvTpk2Zpr8"], "start_seconds": ["340", "70"], "properties": ["engine, idle, man", "wind, blows, vehicle"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a propeller rotates loudly and intensely"], "sample_ids": ["ugHJF0hfYkg", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["loud, propeller, move", "loud, intense, propeller"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a helicopter is flying overhead ", "a helicopter is flying overhead "], "question": "which propeller is louder", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "an engine runs loudly"], "sample_ids": ["tZGN5a7ybxo", "vqZuVbG6-HI"], "start_seconds": ["60", "130"], "properties": ["ring, train, horn", "loud, engine, run"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a lawn mower is running and men are speaking "], "question": "which train is louder", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zY3icUyMdh8", "yajyRTUQk3U"], "start_seconds": 
["20", "400"], "properties": ["dog, bark, engine", "a woman, something, fried"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a person snoring several times"], "sample_ids": ["yYEVLuqEytU", "spJCm8tD9Zo"], "start_seconds": ["40", "90"], "properties": ["animal, pig, background", "snore, person, several"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a person is snoring loudly"], "question": "which entity is a person?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "two women and a man talk while a kid cries"], "sample_ids": ["vimzuGQvdcU", "wyllXV6PjKo"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "a kid, talk, cry"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a woman speaks and a baby cries"], "question": "which entity has a kid?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["vddP56-ogds", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["water, flow, laugh", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking and a dog is barking "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["television 
program is played far away while a woman talks and birds tweet nearby", "a toilet flushes and water drains unevenly"], "sample_ids": ["vbZ-0lGPneg", "vhJWZheqaE"], "start_seconds": ["30", "0"], "properties": ["a woman, a television program, a bird", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a toilet is flushed"], "question": "which entity has more water", "label": 1}, {"captions": ["a male speaks and another male speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["viuTg1M-dqg", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["two males, speaking, male", "loud, multiple, distance"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tEE3MpBt1sg", "xfaoyyzw2WU"], "start_seconds": ["50", "180"], "properties": ["drill, something, laugh", "loud, jet engine, roar"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "a toilet flushes and a female speaks"], "sample_ids": ["y8dSeubCNI", "yaln9y8I7ms"], "start_seconds": ["4", "230"], "properties": ["engine revving, people speaking, motorcycle", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], 
"captions_pred_audio": ["an engine revving and people talking in the background", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a man is filing a hard object", "a train horn blows as it passes by"], "sample_ids": ["vveS8HT7Uog", "zVacuqSb4LI"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "horn, blows, train"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which object is louder", "label": 0}, {"captions": ["a person sneezes followed by another person speaking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["t8CV69hcvF0", "uZesmtKZGSw"], "start_seconds": ["210", "250"], "properties": ["person, sneeze, follow", "men, talk, cars"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a infant makes noise and is excited"], "sample_ids": ["zkKdxzNC97Y", "wIJK3-5y0kA"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "noise, excited, infant"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a door is opened and closed", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a man speaks as a car is passing by"], "sample_ids": ["su6FAOcOA8c", "sK4u5T8hW78"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "a, car, pass"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking with background 
noise and breathing sounds "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["distant men speak as a spray can nozzle is depressed", "an infant crying as a woman laughs"], "sample_ids": ["rwtmaKiCcQU", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["nozzle, depressed, spray can", "a, laugh, infant"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["spraying and people speaking", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "water flows and trickles"], "sample_ids": ["yajyRTUQk3U", "tB7hWb9gTuQ"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "water, flow, trickle"], "captions_pred_video": ["- a woman cooking in the kitchen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a small engine idles continuously", "vehicles pass by on a roadway"], "sample_ids": ["y5WII6cTH7k", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["engine, idle, continuously", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of a fire truck entering a garage"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vddP56-ogds", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["water, flow, laugh", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["water 
is running and gurgling and a man is speaking", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "pigeons vocalize and birds chirp"], "sample_ids": ["yDoT73BWsdA", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["engine, revs, vehicle", "vocalize, bird, chirp"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "of the pigeon in the cage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["an engine runs loudly", "water pouring and bubbling"], "sample_ids": ["vqZuVbG6-HI", "uyRfq-jKPpo"], "start_seconds": ["130", "50"], "properties": ["loud, engine, run", "water, bubbles, pouring"], "captions_pred_video": ["footage is blurry because it's raining outside", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": 
["shmR4OZtzqA", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["man, engine, idle", "animal, grunts, snorts"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man speaks while a motor runs", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "a infant makes noise and is excited"], "sample_ids": ["ugHJF0hfYkg", "wIJK3-5y0kA"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "noise, excited, infant"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a helicopter is flying overhead ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a infant makes noise and is excited"], "sample_ids": ["zl9Dqx-j7q4", "wIJK3-5y0kA"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "noise, excited, infant"], "captions_pred_video": ["footage of a man driving a car in the dark", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a jet 
engine roars ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a machine beeps continuously"], "sample_ids": ["sQwlkXjQabo", "y682ml90jGw"], "start_seconds": ["10", "11"], "properties": ["liquid, surface, spray", "beeps, machine, continuously"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a beeping sound is being made "], "question": "which entity is not silent", "label": 1}, {"captions": ["a person whistles a meandering tune", "a man speaks as a motor runs in the background"], "sample_ids": ["uFoga8sHpiw", "xZepNM9qcRA"], "start_seconds": ["90", "30"], "properties": ["person, tune, whistle", "background, motor, run"], "captions_pred_video": ["footage of a bird in a cage", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person whistles a song", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 0}, {"captions": ["a dog whimpers and a woman briefly talks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["y1saVTXsKwc", "vb1fPSDI4c"], "start_seconds": ["80", "30"], "properties": ["a, dog, talk", "multiple, people, yell"], "captions_pred_video": ["a dog playing with a pink ball", null], "captions_pred_audio": ["a dog barks and a man speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a clock ticktocks briefly", "a duck quacks continuously"], "sample_ids": ["u7C-AEBQM", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks briefly", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a ticktock of a clock", "a duck is quacking 
loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xfaoyyzw2WU", "wqZ135Ssz0"], "start_seconds": ["180", "60"], "properties": ["loud, jet engine, roar", "two men, woman, birds"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", null], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which is quieter", "label": 1}, {"captions": ["dogs barking and whimpering", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tIY7qOV3rEM", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["barking, whimpering, dog", "a, scream, girl"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tDVADusiIoc", "vfYTJq7nU"], "start_seconds": ["60", "130"], "properties": ["man, radio, blows", "rustling, ducks, quack"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a duck quacks and a woman speaks"], "question": "which entity is about a man speaking on a radio as wind blows?", "label": 0}, {"captions": ["vehicles pass by on a roadway", "a machine beeps continuously"], "sample_ids": ["tgbONvsP47Y", "y682ml90jGw"], "start_seconds": ["0", "11"], "properties": ["pass, vehicle, roadway", 
"beeps, machine, continuously"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "a beeping sound is being made "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "an infant crying frantically"], "sample_ids": ["tEE3MpBt1sg", "zwOBqeFTgiU"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "cry, infant, frantically"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "of the baby crying in the car seat"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "several insects fly while two men talk"], "sample_ids": ["s7knHCFW82w", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["blow horn, get close, train", "several, fly, men"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["continuous sizzling with a woman speaking towards the end", "some men converse over an engine running"], "sample_ids": ["ukxt9I7eMMg", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["continuous, woman, speaking", "men, converse, engine"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which 
entity is a video of a woman speaking?", "label": 0}, {"captions": ["paper folding and crinkling", "a toilet flushes and water sputters as it drains"], "sample_ids": ["zPpG3RD8lSs", "smGI3C1NZc"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "water, drain, toilet"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", null], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a car accelerates and wind blows"], "sample_ids": ["u21-Z5gJCB8", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, voice, man", "accelerates, wind, blows"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a child speaks", "vehicles pass by on a roadway"], "sample_ids": ["yW6FWLSLkx4", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["a, child, 
speaks", "pass, vehicle, roadway"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a goat screams and people speak in the background", "people speak as gunfire rings out"], "sample_ids": ["xC8kbrKJmco", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["background, goat, scream", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["people clap and speak in the distance", "a car accelerates and wind blows"], "sample_ids": ["wwyfGO2J4", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["clap, distance, speak", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "pigeons vocalize and birds chirp"], "sample_ids": ["xKB8O8LTs6s", "uiS58TNyUiw"], "start_seconds": ["70", "430"], "properties": ["music, radio, gunshots", "vocalize, bird, chirp"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of the pigeon in the cage"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a car speeding up in the distance", "paper folding and crinkling"], "sample_ids": ["u0TrcHhkPQ", "zPpG3RD8lSs"], 
"start_seconds": ["20", "20"], "properties": ["distance, car, speed", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which is not a car", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tK4VlLsNxak", "xKB8O8LTs6s"], "start_seconds": ["120", "70"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "music, gunfire, explosion"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a clock ticktocks"], "sample_ids": ["wvKpEYswXO0", "v-g-j2uTByM"], "start_seconds": ["150", "30"], "properties": ["plastic, tap, speak", "ticktocks, 
clock, ticktocks"], "captions_pred_video": ["of the person preparing food in the kitchen", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["ziUT9IFTkjg", "yLy-WycbVVE"], "start_seconds": ["10", "30"], "properties": ["background, birds, rustling", "background, people, talk"], "captions_pred_video": [null, "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity has a background of birds chirping?", "label": 0}, {"captions": ["someone snores nearby", "pigeons vocalize and birds chirp"], "sample_ids": ["spJCm8tD9Zo", "uiS58TNyUiw"], "start_seconds": ["90", "430"], "properties": ["someone snores, nearby, someone", "vocalize, bird, chirp"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the pigeon in the cage"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "a clock ticktocks"], "sample_ids": ["vf44CgrjT0A", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a loud burp", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a vehicle 
accelerates before a race car idles then accelerates quickly", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sjlVMgdGSK0", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["accelerates, vehicle, race car", "harsh, wind, blows"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["small dogs growl, bark and yip.", "a car accelerates and wind blows"], "sample_ids": ["sShpyu2l4YQ", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["growl, bark, yip", "accelerates, wind, blows"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "a clock ticktocks"], "sample_ids": ["y8dSeubCNI", "v-g-j2uTByM"], "start_seconds": ["4", "30"], "properties": ["engine revving, people speaking, motorcycle", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an engine revving and people talking in the background", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "water flows as men speak and yell"], "sample_ids": ["tK4VlLsNxak", "vJ7JPEFhyLA"], "start_seconds": ["120", "16"], "properties": ["a, dial, telephone", "water, flow, men"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and using a 
sewing machine", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and dialing a rotary telephone?", "label": 0}, {"captions": ["a person speaks briefly", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zOZleIRqZm4", "vfYTJq7nU"], "start_seconds": ["80", "130"], "properties": ["person, talk, brief", "rustling, ducks, quack"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a duck quacks and a woman speaks"], "question": "which entity is about a person talking?", "label": 0}, {"captions": ["a man sprays as a scraping occurs in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["sOa7g-44Dag", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["background, man, spray", "wind, blows, vehicle"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sxYkFKFIZD0", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["screech, man, door", "three men, wind, flow"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 
audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about liquid flowing?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a child speaks in closed space"], "sample_ids": ["spJCm8tD9Zo", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["snores, wheezes, sleeps", "child, space, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vb1fPSDI4c", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["multiple, people, yell", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a woman is speaking while food is frying in the background"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a car accelerates and wind blows"], "sample_ids": ["yRx9txMcBl0", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["motors, tires, screech", "accelerates, wind, blows"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, 
gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a race car accelerates and revs its engine "], "question": "which entity is a car?", "label": 1}, {"captions": ["a dog barks and whimpers", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sShpyu2l4YQ", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["barks, whimpers, dog", "engine, laugh, loud"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a dog is barking and growling", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet flushes and water drains", "a child speaks in closed space"], "sample_ids": ["sfAvvZwdLCY", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["water drains, flushes, water", "child, space, speak"], "captions_pred_video": ["footage of the toilet in the bathroom", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yYJksgsxx5U", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["audio, clicks, scraping", "stream, water, flow"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is 
speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["white noise and birds chirping", "birds chirp and objects are moved around"], "sample_ids": ["wRBHTgrbiwg", "yPUYU6t3rwo"], "start_seconds": ["50", "370"], "properties": ["noise, white, chirping", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "insects buzz and a man speaks"], "question": "which entity is more like a video", "label": 1}, {"captions": ["water flows and trickles", "a car speeding up in the distance"], "sample_ids": ["tB7hWb9gTuQ", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["water, flow, trickle", "distance, car, speed"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", null], "captions_pred_audio": ["water is splashing and gurgling", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a car accelerates and wind blows", "water is sprayed across a hard surface"], "sample_ids": ["u0TrcHhkPQ", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["accelerates, wind, blows", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a car accelerates and wind blows"], "sample_ids": ["xfudFO976zE", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["animal, bleats, cry", "accelerates, wind, blows"], "captions_pred_video": ["footage is blurry and shaky", null], 
"captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a woman speaks as she rubs two objects together"], "sample_ids": ["xyL9F5VrjkE", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["wind, motor, distance", "two objects, woman, speak"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a motor?", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "water flows and trickles"], "sample_ids": ["vbpKkWvfOu4", "tB7hWb9gTuQ"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "water, flow, trickle"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a child speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yW6FWLSLkx4", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["a, child, speaks", "rustling, ducks, quack"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a duck quacks and a woman speaks"], "question": "which entity is about a child speaking?", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a car accelerates and wind blows"], "sample_ids": ["tDlfY3nmx1A", "u0TrcHhkPQ"], "start_seconds": ["160", "20"], "properties": ["applause, laugh, man", "accelerates, wind, blows"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", null], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a woman speaks happily and an animal chirps"], "sample_ids": ["zTLVJCo4WEE", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["a, crickets, sing", "a woman, chirps, animal"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman is speaking and a dog is barking "], "question": "which entity has a man 
and woman speaking?", "label": 0}, {"captions": ["long loud burping by a man", "a man speaks as a car is passing by"], "sample_ids": ["xmiUIOhtZyQ", "sK4u5T8hW78"], "start_seconds": ["60", "30"], "properties": ["loud, burp, man", "a, car, pass"], "captions_pred_video": ["homer simpson drinking a beer", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a propeller rotates loudly and intensely"], "sample_ids": ["sfAvvZwdLCY", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["water drains, flushes, water", "loud, intense, propeller"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a toilet is flushed", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a male speaks over some small clicks", "small dogs yip and bark sharply"], "sample_ids": ["uXxVebHsGZ8", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["male, clicks, speak", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on 
an intercom", "an infant crying as a woman laughs"], "sample_ids": ["sZPuqDgX2V0", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["engine, accelerate, intercom", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yajyRTUQk3U", "vbZ-0lGPneg"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "a woman, a television program, a bird"], "captions_pred_video": ["- a woman cooking in the kitchen", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking and a dog is whimpering"], "question": "which woman is speaking over sizzling noise", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uYT5gxnyMWM", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["person, spray, yell", "three men, wind, flow"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a person speaking over spraying and another person yelling?", "label": 0}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a vehicle accelerates squealing tires"], "sample_ids": ["weDbePuc-Xc", "sd7xVssqlw"], "start_seconds": ["40", "50"], "properties": ["music, slaps, human", "accelerates, tires, squealing"], 
"captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a toilet flushes and a female speaks"], "sample_ids": ["vs65y4qmyBE", "yaln9y8I7ms"], "start_seconds": ["340", "230"], "properties": ["wind, blows, strongly", "female, flushes, toilet"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "winds blows roughly as a vehicle races past"], "sample_ids": ["zcDwZ6W7E3E", "xjvTpk2Zpr8"], "start_seconds": ["180", "70"], "properties": ["a, man, speak", "wind, blows, vehicle"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "several insects fly while two men talk"], "sample_ids": ["ylpYOorfH4o", "s-T9OVOiMLo"], "start_seconds": ["410", "330"], "properties": ["engine, running, wind", "several, fly, men"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube 
how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["some men converse over an engine running", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sCiy7QS1U", "wz7N8YRy74I"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "vehicles pass by on a roadway"], "sample_ids": ["zVacuqSb4LI", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["blares, fades, train", "pass, vehicle, roadway"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "water splashes and a door squeaks"], "sample_ids": ["vb1fPSDI4c", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["multiple, people, yell", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["a crowd of people are talking and laughing", "a dog barks and taps with background noise "], "question": "which entity has a door squeaking?", "label": 1}, {"captions": ["a man speaks as a machine runs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vD6lYD1l0BY", "vbZ-0lGPneg"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "a woman, a television program, a bird"], "captions_pred_video": ["game controller being held in the hands of the person", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a weapon fires multiple times"], "sample_ids": ["zj2R0XoFr5k", "sMC07Ucy7kg"], "start_seconds": ["50", "10"], "properties": ["airplane, boy, fly", "weapon, fire, multiple"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage is from a car's point of view"], "captions_pred_audio": ["a woman speaks 
while a helicopter flies overhead ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["water splashes as an animal walks through", "water splashes and a door squeaks"], "sample_ids": ["w1ir-sZ3Im8", "sdXV-ylviw"], "start_seconds": ["90", "190"], "properties": ["animal, water, splashes", "sound, splash, door"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a dog barks and taps with background noise "], "question": "which entity is more likely to be a video of a door squeaking?", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "an infant crying as a woman laughs"], "sample_ids": ["tZGN5a7ybxo", "xhmRY9yhC7c"], "start_seconds": ["60", "20"], "properties": ["ring, train, horn", "a, laugh, infant"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["t25U-v4k4ts", "w34HjHr6gAY"], "start_seconds": ["40", "30"], "properties": ["a, chirps, bird", "beeps, hit, woman"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 
1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a machine engine runs and a man speaks"], "sample_ids": ["vr8ZXjEBhMQ", "vs65y4qmyBE"], "start_seconds": ["150", "340"], "properties": ["wind, blow, zoom", "engine, run, man"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a heavy engine is running and men are speaking "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["water pouring and bubbling", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uyRfq-jKPpo", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["water, bubbles, pouring", "a woman, laughs, animal"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["water is running from a faucet", "a woman is speaking and a baby is crying"], "question": "which entity is a video", 
"label": 1}, {"captions": ["a person is snoring while sleeping", "a woman speaks and taps on a hard surface before running tap water"], "sample_ids": ["vJrjSeP17yE", "wvKpEYswXO0"], "start_seconds": ["40", "150"], "properties": ["a person is sleeping, snoring, person", "water, tap, run"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a person", "label": 0}, {"captions": ["a crowd yells, reacts and applauds", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wztCSUxOf8", "ukg5L09Wpvo"], "start_seconds": ["130", "150"], "properties": ["a crowd, yells, applauds", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "some tunes played by whistling"], "sample_ids": ["w2JXXIAdUdg", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["snoring, distance, person", "tune, play, whistling"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a person is whistling"], "sample_ids": ["xhmRY9yhC7c", "sIXTftIuUgw"], "start_seconds": ["20", "90"], "properties": ["a, laugh, infant", "person, whistling, person"], 
"captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a person whistling a song"], "question": "which person is whistling", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uWAAAL4CIoc", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["a woman, chirps, animal", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause harm", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a stream of water runs briefly"], "sample_ids": ["xzKKf9bKNUo", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["background, noise, snoring", "stream, water, run"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "some men converse over an engine running"], "sample_ids": ["zY3icUyMdh8", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["dog, bark, engine", "men, converse, engine"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a dog barking and a vehicle engine idling followed shortly by vehicle engine revving?", "label": 0}, {"captions": ["a woman and man speak 
while food is frying", "a train horn blows as it passes by"], "sample_ids": ["zk-xJGQU8-4", "zVacuqSb4LI"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "horn, blows, train"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train", "label": 1}, {"captions": ["continuous snoring", "birds chirp and objects are moved around"], "sample_ids": ["sLkeqCDJIyw", "yPUYU6t3rwo"], "start_seconds": ["120", "370"], "properties": ["loud, snoring, noise", "birds chirp, objects are moved around, birds"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a person is snoring loudly", "insects buzz and a man speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["u2f5NpsoHBg", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["person, laugh, clap", "a woman, something, fried"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a telephone rings followed by a woman talking"], "sample_ids": ["yYJksgsxx5U", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["audio, woman, silverware", "ring, talk, woman"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", null], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which woman is talking", "label": 1}, {"captions": ["food is frying then a woman speaks", "a toilet flushes and a female speaks"], "sample_ids": ["ukxt9I7eMMg", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["food, woman, speak", "female, flushes, toilet"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a toilet flushes and a man speaks"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a bus engine 
idles while a woman speaks making an announcement"], "sample_ids": ["wnpJndXuxLc", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["blows, vehicle, train", "engine, idle, woman"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "water pouring and bubbling"], "sample_ids": ["y8dSeubCNI", "uyRfq-jKPpo"], "start_seconds": ["4", "50"], "properties": ["engine revving, people speaking, motorcycle", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["an engine revving and people talking in the background", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks while water drains", "a toilet flushes and a female speaks"], "sample_ids": ["vSeGhaZt-aI", "yaln9y8I7ms"], "start_seconds": ["50", "230"], "properties": ["water, drain, man", "female, flushes, toilet"], 
"captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a person speaks briefly", "an infant crying frantically"], "sample_ids": ["zOZleIRqZm4", "zwOBqeFTgiU"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "cry, infant, frantically"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w0xsN8X18Y", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["rain, thunder, surface", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a man speaking and a rooster crows?", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "a woman speaks as she rubs two objects together"], "sample_ids": ["vz8868znkVQ", "vzxHnu-SFEw"], "start_seconds": ["60", "80"], "properties": ["audio, click, kid speaking", "two objects, woman, speak"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["u--KhUW8l1Y", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "gun, shoot, water"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "water is sprayed across a hard surface"], "sample_ids": ["ul60S8TXDA8", "sQwlkXjQabo"], "start_seconds": ["60", "10"], "properties": ["sound, distance, bell", "water, spray, surface"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "spraying followed by silence"], "question": "which entity 
is a liquid", "label": 1}, {"captions": ["some tunes played by whistling", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["u6BnG6YZqJ4", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["tune, play, whistling", "female, spraying, scream"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "water pouring and bubbling"], "sample_ids": ["uZesmtKZGSw", "uyRfq-jKPpo"], "start_seconds": ["250", "50"], "properties": ["men, talk, cars", "water, bubbles, pouring"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how 
to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "paper folding and crinkling"], "sample_ids": ["yZmhM1HcsyE", "zPpG3RD8lSs"], "start_seconds": ["4", "20"], "properties": ["engine, roar, water", "paper, fold, crinkle"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "the wind blows and a mouse clicks "], "question": "which entity is more likely to be a paper craft", "label": 1}, {"captions": ["a male speaks over some small clicks", "a woman speaks happily and an animal chirps"], "sample_ids": ["uXxVebHsGZ8", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["male, clicks, speak", "a woman, chirps, animal"], "captions_pred_video": 
[null, null], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a woman is speaking and a dog is barking "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "some men converse over an engine running"], "sample_ids": ["yRx9txMcBl0", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["motors, tires, screech", "men, converse, engine"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a person talking to someone?", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vSeGhaZt-aI", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["water, bubbles, speak", "a, scream, girl"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": 
["ylpYOorfH4o", "s4Uz1Ffgo04"], "start_seconds": ["410", "100"], "properties": ["engine, run, loud", "roars, background, people speaking"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a woman speaks as she rubs two objects together"], "sample_ids": ["t8CV69hcvF0", "vzxHnu-SFEw"], "start_seconds": ["210", "80"], "properties": ["person, sneeze, follow", "two objects, woman, speak"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "multiple people speak and children yell while water gurgles"], "sample_ids": ["w5W5Kqtc8E", "vb1fPSDI4c"], "start_seconds": ["100", "30"], "properties": ["wind, blow, vehicle", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a crowd of people are talking and laughing"], "question": "which entity has more people yelling", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "people applaud and hoot and chat quietly"], "sample_ids": ["s4Uz1Ffgo04", "wwyfGO2J4"], "start_seconds": ["100", "90"], "properties": ["water, rushes, motorcycle", "people, applaud, hoot"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "winds blows roughly as a vehicle races past"], "sample_ids": ["tOj4tdLRaA", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], "properties": ["woman, laugh, baby", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a jet engine roars and wind blows "], "question": "which entity 
is more likely to be in a vehicle", "label": 1}, {"captions": ["water flows as men speak and yell", "water splashes as an animal walks through"], "sample_ids": ["vJ7JPEFhyLA", "w1ir-sZ3Im8"], "start_seconds": ["16", "90"], "properties": ["water, flow, men", "animal, water, splashes"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "water splashes and gurgles as people speak"], "question": "which entity is more calm", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a duck quacks several times"], "sample_ids": ["uzQnlJXBbOM", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "quacks, duck, several"], "captions_pred_video": ["footage of a person using a cell phone on a table", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a telephone rings and a man speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sK4u5T8hW78", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["a, car, pass", "men, talk, cars"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "a woman speaks happily and an animal chirps"], "sample_ids": ["tOj4tdLRaA", "uWAAAL4CIoc"], "start_seconds": ["70", "0"], "properties": ["woman, laugh, baby", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a dog is barking "], "question": "which entity has a baby in it", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vzxHnu-SFEw", "xBxDz0CFVn0"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "stream, water, flow"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman talking as an infant is crying", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tMbMDvT50j8", "tdWhHV3X25Q"], "start_seconds": ["12", "60"], "properties": ["a, talk, infant", "applause, audience, yells"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["water splashes as an animal walks through", "someone whistles a tune"], "sample_ids": ["w1ir-sZ3Im8", "sIXTftIuUgw"], "start_seconds": ["90", "90"], "properties": ["animal, water, splashes", "someone, tune, whistle"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a woman speaks with water running", "a stream of water runs briefly"], "sample_ids": ["wTideSjRFS0", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["water, running, woman", "stream, water, run"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is 
speaking while water is running in the background", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["people speak then an engine runs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uMTTDZ2mb4", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["engine, run, people", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["an engine runs loudly", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vqZuVbG6-HI", "vlS6YMeWAPo"], "start_seconds": ["130", "40"], "properties": ["loud, engine, run", "sheep, baa, birds"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a goat bleats and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "water flows and trickles"], "sample_ids": ["rwtmaKiCcQU", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["nozzle, depressed, spray can", "water, flow, trickle"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["spraying and people speaking", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water flows as men speak and yell", "people cheer as a vehicle engine revs"], "sample_ids": ["vJ7JPEFhyLA", "xjhAnI2q6hM"], "start_seconds": ["16", "6"], "properties": 
["water, flow, men", "engine revs, vehicle, people"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle?", "label": 1}, {"captions": ["a duck quacks continuously", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vh30P49Po6s", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["quacks, continuously, duck", "a woman, laughs, animal"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a woman speaks and is crumpling paper"], "sample_ids": ["vlS6YMeWAPo", "xvDdE3zNf8Y"], "start_seconds": ["40", "120"], "properties": ["noise, bleat, call", "A, crumple, paper"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a goat bleats and birds chirp", "a woman speaks and crumples paper"], "question": "which entity is crumpling paper", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "water is sprayed across a hard surface"], "sample_ids": ["zl9Dqx-j7q4", "sQwlkXjQabo"], "start_seconds": ["6", "10"], "properties": ["engine, laugh, loud", "water, spray, surface"], "captions_pred_video": ["footage of a man driving a car in the dark", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a jet engine roars ", "spraying followed by silence"], "question": "which 
entity is a liquid", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sSMl2vc3ek", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["loud, multiple, distance", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "people cheer as a vehicle engine revs"], "sample_ids": ["s4Uz1Ffgo04", "xjhAnI2q6hM"], "start_seconds": ["100", "6"], "properties": ["water, rushes, motorcycle", "engine revs, vehicle, people"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a stream of water flows quickly", "a clock ticktocks"], "sample_ids": ["wbHTKEJZyhc", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["stream, water, flow", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video 
footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a clock ticktocks"], "sample_ids": ["vimzuGQvdcU", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a group of people are rafting down a river", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["s4Uz1Ffgo04", "wz7N8YRy74I"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "rooster, crow, background, men"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", 
"label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "an engine runs loudly"], "sample_ids": ["sZPuqDgX2V0", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["engine, accelerate, intercom", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a lawn mower is running and men are speaking "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a horn honks and then loudly blares", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wnpJndXuxLc", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["horn, honk, loud", "music, gunfire, explosion"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["birds chirp, a woman speaks, and insects buzz", "a frog croaks as other frogs croak in the background"], "sample_ids": ["t97k0cejSQE", "yswmmRZFItk"], "start_seconds": ["250", "0"], "properties": ["sound, chirp, buzz", "background, frog, croak"], "captions_pred_video": ["a bee on a purple thistle flower", "a close up of a frog in the water"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a frog is croaking"], "question": "which entity has a background of frogs?", "label": 1}, {"captions": ["water splashes and a door squeaks", "vehicle engines race around a track as a man commentates"], "sample_ids": ["sdXV-ylviw", "sZPuqDgX2V0"], "start_seconds": ["190", "30"], "properties": ["sound, splash, door", "commentator, race, track"], "captions_pred_video": [null, null], "captions_pred_audio": 
["a dog barks and taps with background noise ", "a man is speaking and a helicopter is flying overhead "], "question": "which entity is a video of a race?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a diesel truck engine runs continuously"], "sample_ids": ["sQGXqGcwOTc", "sZvwOuuPGP0"], "start_seconds": ["3", "50"], "properties": ["cling, speak, dishes", "engine, diesel, truck"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a medium engine is running "], "question": "which entity is a moving object", "label": 1}, {"captions": ["water flows followed by women screaming", "a child speaks"], "sample_ids": ["w5W5Kqtc8E", "yW6FWLSLkx4"], "start_seconds": ["100", "40"], "properties": ["water, flow, women", "a, child, speaks"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is silent", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a steam engine runs and whistles as it passes by"], "sample_ids": ["u7C-AEBQM", "se87d6yxEOA"], "start_seconds": ["30", "10"], "properties": ["ticks, rhythmic, quiet", "run, whistle, pass"], "captions_pred_video": [null, "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a ticktock of a clock", "a train is moving and blowing its whistle "], "question": "which entity is louder", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tK4VlLsNxak", "yajyRTUQk3U"], "start_seconds": ["120", 
"400"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "a woman, something, fried"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["some people speak", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vbZ-0lGPneg", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "a woman, something, fried"], "captions_pred_video": ["of a man holding a baby duck in his hands", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a person sniffles and sneezes", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["uRlbY6aoBU", "x6ijhqRY38s"], "start_seconds": ["0", "250"], "properties": ["sneezes, sniffles, person", "something metal, glass, hit"], "captions_pred_video": [null, "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and dishes are clanging "], "question": "which entity is about hitting something metal against glass?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a toilet flushes and water drains"], "sample_ids": ["s3cTDAj31g", "sfAvvZwdLCY"], "start_seconds": ["80", "20"], "properties": ["man, talk, woman", "water drains, flushes, water"], "captions_pred_video": [null, "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and a baby is 
crying", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["an insect buzzes around continuously", "a duck quacks continuously"], "sample_ids": ["v25l1jef3JY", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "quacks, continuously, duck"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["a stream of water flows quickly", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wbHTKEJZyhc", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["stream, water, flow", "a woman, laughs, animal"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a 
river in autumn with trees and", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is not a stream of water?", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "several insects fly while two men talk"], "sample_ids": ["xBxDz0CFVn0", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["stream, water, flow", "several, fly, men"], "captions_pred_video": ["footage is blurry and out of focus", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a man woman speak while crickets sing"], "sample_ids": ["yZp6xizR0yU", "zTLVJCo4WEE"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "a, crickets, sing"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a woman speaks and crickets chirp"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["here comes the train and it starts to blow the horn and get close", "wind blows as people chatter quietly"], "sample_ids": ["s7knHCFW82w", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "wind, chatter, people"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking with wind noise in the background "], "question": "which entity is 
quieter", "label": 1}, {"captions": ["a man talks while vehicles pass by", "pigeons vocalize and birds chirp"], "sample_ids": ["sK4u5T8hW78", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["a, man, talk", "vocalize, bird, chirp"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "wind blows as people chatter quietly"], "sample_ids": ["y4tPJXBKDig", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["a, noise, talk", "wind, chatter, people"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["yLy-WycbVVE", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["background, people, talk", "a woman, chirps, animal"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a woman is speaking and a dog is barking "], 
"question": "which entity is more quiet", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "wind blows as people chatter quietly"], "sample_ids": ["uRlbY6aoBU", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["a, distance, sneeze", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "wind blows as people chatter quietly"], "sample_ids": ["yks4cLgIDMc", "xBxDz0CFVn0"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "wind, chatter, people"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a child is crying", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "pigeons vocalize and birds chirp"], "sample_ids": ["tGcFnX0GHI", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["ring, talk, woman", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xBxDz0CFVn0", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["stream, water, flow", "water, radio, man"], "captions_pred_video": ["footage is blurry and out of focus", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking with 
wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a motorcycle engine is revving while people are speaking"], "sample_ids": ["tDlfY3nmx1A", "y8dSeubCNI"], "start_seconds": ["160", "4"], "properties": ["applause, laugh, man", "engine revving, people speaking, motorcycle"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", null], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "an engine revving and people talking in the background"], "question": "which entity is a motorcycle?", "label": 1}, {"captions": ["an engine runs loudly", "wind blowing followed by a zoom"], "sample_ids": ["vqZuVbG6-HI", "vr8ZXjEBhMQ"], "start_seconds": ["130", "150"], "properties": ["loud, engine, run", "wind, blow, zoom"], "captions_pred_video": ["footage is blurry because it's raining outside", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["continuous snoring", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sLkeqCDJIyw", "xKB8O8LTs6s"], "start_seconds": ["120", "70"], "properties": ["loud, snoring, noise", "music, gunfire, explosion"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person is snoring loudly", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is louder", "label": 1}, {"captions": ["a man talks as several small engines run", "water gurgles, metal squeaks and the water stops"], "sample_ids": ["u9A6VZQCZpU", "x4a9YGIw4ok"], "start_seconds": ["30", "120"], "properties": ["a, man, talk", "water, gurgles, stops"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a toilet flushes and water splashes"], "question": "which entity is about water?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a duck quacks continuously"], "sample_ids": ["uYT5gxnyMWM", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "quacks, continuously, duck"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "water pouring and bubbling"], "sample_ids": ["tgbONvsP47Y", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["noise, truck, accelerate", "water, bubbles, pouring"], "captions_pred_video": ["footage of a fire truck entering a garage", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do 
an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a car is driving on the road ", "water is running from a faucet"], "question": "which is a liquid", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a muffled toilet flushes and the water drains"], "sample_ids": ["yajyRTUQk3U", "sfAvvZwdLCY"], "start_seconds": ["400", "20"], "properties": ["noise, woman, speak", "flushes, drains, water"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a toilet is flushed"], "question": "which entity is silent", "label": 1}, {"captions": ["a motorcycle engine works nearby", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tOSWIURC-4", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["engine, work, nearby", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "wind blowing followed by a zoom"], "sample_ids": ["tOSWIURC-4", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["engine, work, nearby", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a lawn mower is running ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["water rushes and then a vehicle zooms past", "a small engine spits 
as it runs"], "sample_ids": ["s4Uz1Ffgo04", "sZvwOuuPGP0"], "start_seconds": ["100", "50"], "properties": ["water, rushes, vehicle", "spits, engine, runs"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a medium engine is running "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zALy31PjDl0", "zl9Dqx-j7q4"], "start_seconds": ["21", "6"], "properties": ["a man, a vehicle, a horn", "engine, laugh, loud"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yYEVLuqEytU", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["animal, pig, background", "a woman, laughs, animal"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal snorting?", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "some tunes played by whistling"], "sample_ids": ["zVacuqSb4LI", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["blares, fades, train", "tune, play, whistling"], "captions_pred_video": ["by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a woman speaks as she rubs two objects together"], "sample_ids": ["tGcFnX0GHI", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["ring, talk, woman", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman is speaking", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "continuous sizzling with a woman speaking towards the end"], "sample_ids": ["tdWhHV3X25Q", "ukxt9I7eMMg"], "start_seconds": ["60", "30"], "properties": ["applause, audience, yells", "continuous, woman, speaking"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking while food is frying in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["an airplane engine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yVPZ2MNWpms", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["engine, airplane, runs", "female, spraying, scream"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car is driving by on the road ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["paper folding and crinkling", "pigeons vocalize and birds chirp"], "sample_ids": ["zPpG3RD8lSs", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["paper, fold, crinkle", "vocalize, bird, chirp"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper 
youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "of the pigeon in the cage"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["loud intermittent buzzing with intermittent laughter", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sLUnaPT5gM8", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["loud, laughter, intermittent", "two men, woman, birds"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["continuous sneezing together with speech", "a infant makes noise and is excited"], "sample_ids": ["x4dZyf9Gbj0", "wIJK3-5y0kA"], "start_seconds": ["130", "30"], "properties": ["continuous, sneeze, speech", "noise, excited, infant"], "captions_pred_video": ["footage is blurry and out of focus", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman sneezes and speaks", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", 
"pigeons vocalize and birds chirp"], "sample_ids": ["rqu8iB22IY", "uiS58TNyUiw"], "start_seconds": ["5", "430"], "properties": ["sound, repeats, laugh", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a vehicle accelerates and squeals tires"], "sample_ids": ["yRx9txMcBl0", "yRx9txMcBl0"], "start_seconds": ["40", "40"], "properties": ["accelerates, tires, squeals", "accelerates, tires, squeals"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a car is revving its engine and skidding "], "question": "which vehicle accelerates and squeals 
tires", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yks4cLgIDMc", "vb1fPSDI4c"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "multiple, people, yell"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a man speaks followed by another man speaking outside"], "sample_ids": ["xhmRY9yhC7c", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["a, laugh, infant", "two men, speak, follow"], "captions_pred_video": ["of a baby crying in a baby bouncer", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a infant makes noise and is excited"], "sample_ids": ["rwTERCUno", "wIJK3-5y0kA"], "start_seconds": ["90", "30"], "properties": ["engine, idle, sputter", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["an engine is idling and vibrating", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "people speak as gunfire rings out"], "sample_ids": ["yRx9txMcBl0", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["accelerates, tires, squeals", "gunfire, ring, speak"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 
5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "people speak as gunfire rings out"], "sample_ids": ["xM4joTqDVp4", "wqTCwqVRDlk"], "start_seconds": ["160", "80"], "properties": ["background, chirp, birds", "gunfire, ring, speak"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sOa7g-44Dag", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["background, man, spray", "engine, laugh, loud"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a person snores loudly multiple 
times at a close distance"], "sample_ids": ["uOpoD0gGXcs", "sSMl2vc3ek"], "start_seconds": ["120", "20"], "properties": ["chirps, woman, bird", "loud, multiple, distance"], "captions_pred_video": ["a herd of cows grazing in the field", null], "captions_pred_audio": ["birds are chirping and a man is speaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "people speak as gunfire rings out"], "sample_ids": ["yNtRmrn0io8", "wqTCwqVRDlk"], "start_seconds": ["210", "80"], "properties": ["storm, distance, strike", "gunfire, ring, speak"], "captions_pred_video": ["footage of a house in the middle of the night", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["rain falls and thunder roars", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "some tunes played by whistling"], "sample_ids": ["tQWGZLItBXk", "u6BnG6YZqJ4"], "start_seconds": ["170", "0"], "properties": ["music, person, ding", "tune, play, whistling"], "captions_pred_video": ["worms revolution screenshots", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "small dogs yip and bark sharply"], "sample_ids": ["shmR4OZtzqA", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["man, engine, idle", "bark, yip, sharply"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche 
youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man speaks while a motor runs", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["water quietly rushes by while birds chirp in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sYITalLZjj4", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["water, rushes, background, birds", "a, scream, girl"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["wind blows and birds chirp", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "people speak as gunfire rings out"], "sample_ids": ["yZp6xizR0yU", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["animal, bleat, cry", "gunfire, ring, speak"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["u21-Z5gJCB8", 
"y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["background, voice, man", "motor noise, horn, siren"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a machine clanks and thumps and a male speaks", "a toilet flushes and a female speaks"], "sample_ids": ["sWZzXuWYY", "yaln9y8I7ms"], "start_seconds": ["420", "230"], "properties": ["male, clanks, thumps", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a toilet flushes and a man speaks"], "question": "which entity is a machine?", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "water pouring and bubbling"], "sample_ids": ["yZmhM1HcsyE", "uyRfq-jKPpo"], "start_seconds": ["4", "50"], "properties": ["engine, roar, water", "water, bubbles, pouring"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how 
to do an afro pu"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a small engine idles continuously"], "sample_ids": ["x6ijhqRY38s", "y5WII6cTH7k"], "start_seconds": ["250", "40"], "properties": ["bowl, silverware, man", "engine, idle, continuously"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "an engine is knocking and vibrating "], "question": "which entity is not moving", "label": 1}, {"captions": ["a car accelerates and wind blows", "small dogs yip and bark sharply"], "sample_ids": ["u0TrcHhkPQ", "v-wcQf4BDY0"], "start_seconds": ["20", "120"], "properties": ["accelerates, wind, blows", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["xKB8O8LTs6s", "wz7N8YRy74I"], "start_seconds": ["70", "30"], "properties": ["music, gunfire, explosion", "rooster, crow, background, men"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a diesel truck engine runs while 
wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xyL9F5VrjkE", "su6FAOcOA8c"], "start_seconds": ["20", "4"], "properties": ["engine, run, wind", "engine, idle, woman"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wz7N8YRy74I", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["rooster, crow, background, people", "rustling, ducks, quack"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a duck quacks and a woman speaks"], "question": "which entity has more animals", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "an infant crying frantically"], "sample_ids": ["vZAw4apG0Es", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["people, clock, converse", "cry, infant, frantically"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of the baby crying in the car seat"], "captions_pred_audio": ["a clock is ticking and people are talking", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["w0xsN8X18Y", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["music, surface, rain", "a woman, laughs, animal"], 
"captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and laughing?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uzQnlJXBbOM", "vb1fPSDI4c"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "multiple, people, yell"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "vehicle engines race around a track as a man commentates"], "sample_ids": ["wAAkbZToh8", "sZPuqDgX2V0"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "commentator, race, track"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man burps and a woman speaks", "a man is speaking and a helicopter is flying overhead "], "question": "which entity is a video of a person speaking and laughing?", "label": 0}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "a stream of water runs briefly"], "sample_ids": ["vr8ZXjEBhMQ", "x-PeY8Yb8M4"], "start_seconds": ["150", "300"], "properties": ["sound, distance, engine", "stream, water, run"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes 
a woman to gasp and the music plays", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sU53zg9Jp7s", "vfYTJq7nU"], "start_seconds": ["380", "130"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "rustling, ducks, quack"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a duck quacks and a woman speaks"], "question": "which entity is about a bird?", "label": 0}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "winds blows roughly as a vehicle races past"], "sample_ids": ["ukg5L09Wpvo", "xjvTpk2Zpr8"], "start_seconds": ["150", "70"], "properties": ["clickety-clack, train, whistle", "wind, blows, vehicle"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zj2R0XoFr5k", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["airplane, fly, overhead", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a train blows its whistle and blows its horn "], "question": "which is a train", "label": 1}, {"captions": ["an infant crying frantically", "a stream of water flows as people talk and wind 
blows"], "sample_ids": ["zwOBqeFTgiU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["cry, infant, frantically", "stream, water, flow"], "captions_pred_video": ["of the baby crying in the car seat", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["dogs barking and whimpering", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["tIY7qOV3rEM", "tDlfY3nmx1A"], "start_seconds": ["0", "160"], "properties": ["barking, whimpering, dog", "applause, laugh, man"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity is a group of people", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["s4Uz1Ffgo04", "zj2R0XoFr5k"], "start_seconds": ["100", "50"], "properties": ["roars, background, people speaking", "airplane, boy, fly"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vlS6YMeWAPo", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["noise, bleat, call", "engine, laugh, loud"], "captions_pred_video": 
["footage of a goat in a pen behind a wooden fence", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a goat bleats and birds chirp", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vJvryTwuAV8", "yajyRTUQk3U"], "start_seconds": ["16", "400"], "properties": ["audience, cheer, man", "a woman, something, fried"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a man speaking with light rustling", "water pouring and bubbling"], "sample_ids": ["zOZleIRqZm4", "uyRfq-jKPpo"], "start_seconds": ["80", "50"], "properties": ["light, rustling, man", "water, bubbles, pouring"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking with crickets chirping in the 
background", "water is running from a faucet"], "question": "which entity is more likely to be in a kitchen", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "wind blowing followed by a zoom"], "sample_ids": ["uEU-Hg5MTN8", "vr8ZXjEBhMQ"], "start_seconds": ["27", "150"], "properties": ["animal, grunts, snorts", "wind, blow, zoom"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "an airplane engine spools and people speak"], "sample_ids": ["v0x1odnXtP0", "wTjoRj1se3U"], "start_seconds": ["210", "390"], "properties": ["keyboard, type, computer", "airplane, engine, spool"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a person is typing on a keyboard", "a jet engine is running and people are talking"], "question": "which is not a type of machine", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "speaking following by laughing and clapping"], "sample_ids": ["x6ijhqRY38s", "u2f5NpsoHBg"], "start_seconds": ["250", "30"], "properties": ["bowl, silverware, man", "person, laugh, clap"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a woman is speaking and a crowd is clapping"], "question": "which person is speaking?", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a car speeding up in the distance"], "sample_ids": ["wIvYjuR3nrg", "u0TrcHhkPQ"], "start_seconds": ["9", 
"20"], "properties": ["birds, pigeons, vocalize", "distance, car, speed"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", null], "captions_pred_audio": ["birds are chirping and cooing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["vbr9mHKc8WM", "wSVhSdj0F0"], "start_seconds": ["40", "10"], "properties": ["noise, loudness, engine", "horn honks, keys jingle, electronic beep"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine is idling", "a car horn honks and keys jangle with background noise "], "question": "which entity is quieter", "label": 0}, {"captions": ["an airplane accelerates briefly", "birds chirp and objects are moved around"], "sample_ids": ["zjTG0gaGCUI", "yPUYU6t3rwo"], "start_seconds": ["80", "370"], "properties": ["accelerates, airplane, briefly", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a jet engine roars as wind blows ", "insects buzz and a man speaks"], "question": "which entity is moving around objects", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "pigeons vocalize and birds chirp"], "sample_ids": ["w8uLijTqtlU", "uiS58TNyUiw"], "start_seconds": ["70", "430"], "properties": ["wind, microphone, noise", "vocalize, bird, chirp"], "captions_pred_video": ["footage is blurry and shaky", "of the pigeon in the cage"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a clock ticktocks"], "sample_ids": ["wztCSUxOf8", "v-g-j2uTByM"], 
"start_seconds": ["130", "30"], "properties": ["a crowd, yells, applauds", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "people cheer as a vehicle engine revs"], "sample_ids": ["sHbXC6na9hg", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["a person, saw, wood", "engine revs, vehicle, people"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["an engine is idling and vibrating", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sncRqQ67iJU", "xKB8O8LTs6s"], "start_seconds": ["460", "70"], "properties": ["loud, repeatedly, man", "music, gunfire, explosion"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person is snoring", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "children speak and play together"], "sample_ids": ["xyL9F5VrjkE", "yVVP8XvWJTo"], "start_seconds": ["20", "260"], "properties": ["engine, run, wind", "children, speak, play"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "children are speaking and breathing with 
background noise "], "question": "which entity is more likely to be in motion", "label": 0}, {"captions": ["water splashes and a motorboat passes as people yell", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w5W5Kqtc8E", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["u6jIvCtKarQ", "sLUnaPT5gM8"], "start_seconds": ["70", "0"], "properties": ["a, man, speaks", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a person using a blender on a stove top", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "paper is crumpling consistently"], "sample_ids": ["wSVhSdj0F0", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["beep, clang, footsteps", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "some men converse over an engine running"], "sample_ids": ["sdXV-ylviw", "sCiy7QS1U"], "start_seconds": ["190", "300"], 
"properties": ["door, toilet, squeaks", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and taps with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation?", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a woman speaks happily and an animal chirps"], "sample_ids": ["sOa7g-44Dag", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["audio, scratching, man", "a woman, chirps, animal"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a cat meows as a young woman speaks"], "sample_ids": ["rqfQRErjfk8", "x5cuQjOdM3E"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "cat, meows, young woman"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a cat meows and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["zuua6-5goWw", "rqu8iB22IY"], "start_seconds": ["30", "5"], "properties": ["birds, chirp, quiet, man, speaks", "sound, repeats, laugh"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a dog barks and a man speaks while music plays "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a car accelerates and wind blows"], "sample_ids": ["tDVADusiIoc", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["wind, radio, waves", "accelerates, wind, blows"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "paper folding and crinkling"], "sample_ids": ["u5RmF3c3Aw", "zPpG3RD8lSs"], "start_seconds": ["60", "20"], "properties": ["engine, car, zoom", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube 
how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "the wind blows and a mouse clicks "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "some men converse over an engine running"], "sample_ids": ["soTOh3zYJfY", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["vehicle, skid, tires", "men, converse, engine"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["t25U-v4k4ts", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["a, chirps, bird", "engine, laugh, loud"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["paper is crumpling consistently", "wind blowing followed by a zoom"], "sample_ids": ["v5cSxLaHADY", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "wind, blow, zoom"], "captions_pred_video": ["footage of the person holding a pair of scissors", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["paper is crumpled and crinkled", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", 
"a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vmrxwuAMb2I", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["a dog, inhales, exhales", "animal, grunts, snorts"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a dog barks and growls", "a woman is speaking and a baby is crying"], "question": "which animal is grunting and snorting", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["ukxt9I7eMMg", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["food, pan, cook", "a woman, something, fried"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking while food is frying in the background"], "question": "what is being cooked in the pan?", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zVacuqSb4LI", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["blares, fades, train", "applause, audience, yells"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["tDlysoZiA1I", "ziUT9IFTkjg"], "start_seconds": ["0", "10"], "properties": ["animal, grunt, multiple", "background, birds, rustling"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "birds are chirping and a chime is ringing "], "question": "which entity has a background of birds chirping", "label": 1}, {"captions": ["a rumble grows louder", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y4MY9mp8-TA", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["loudness, increase, rumble", "wind, blow, vehicle"], "captions_pred_video": ["a helicopter flying in the sky", null], "captions_pred_audio": ["a helicopter flies overhead ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["continuous sneezing together with speech", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x4dZyf9Gbj0", "xKB8O8LTs6s"], "start_seconds": ["130", "70"], "properties": ["continuous, sneeze, speech", "music, gunfire, explosion"], "captions_pred_video": ["footage is blurry and out of focus", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman sneezes and speaks", "music plays while a woman 
speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a man speaks as a motor runs in the background"], "sample_ids": ["w34HjHr6gAY", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["beeps, hit, woman", "background, motor, run"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vddP56-ogds", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["liquid, laughs, man", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a infant makes noise and is excited"], "sample_ids": ["vBslzh7saPw", "wIJK3-5y0kA"], "start_seconds": ["90", "30"], "properties": ["engine, spools, takes", "noise, excited, infant"], 
"captions_pred_video": ["a pickup truck carrying a large object down the road", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a baby cries and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["children speak and play together", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yVVP8XvWJTo", "uYT5gxnyMWM"], "start_seconds": ["260", "50"], "properties": ["children, speak, play", "female, spraying, scream"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uZesmtKZGSw", "tiDFTC-5vU"], "start_seconds": ["250", "30"], "properties": ["car, track, man", "male, duck, laugh"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has a man speaking to a duck?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a child yells and 
another yells"], "sample_ids": ["vs65y4qmyBE", "vMDHu7Lxcgw"], "start_seconds": ["340", "410"], "properties": ["engine, run, man", "two, yell, child"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a boy playing on a trampoline in the backyard"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a child is shouting"], "question": "which entity is more active", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uiS58TNyUiw", "uYT5gxnyMWM"], "start_seconds": ["430", "50"], "properties": ["vocalize, bird, chirp", "female, spraying, scream"], "captions_pred_video": ["of the pigeon in the cage", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "water flows and trickles"], "sample_ids": ["zkKdxzNC97Y", "tB7hWb9gTuQ"], "start_seconds": ["27", "30"], "properties": ["hard, surface, door", "water, flow, trickle"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a door is opened and closed", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a child speaks in closed space"], "sample_ids": ["w5W5Kqtc8E", "yW6FWLSLkx4"], "start_seconds": ["100", "40"], "properties": ["wind, engine, scream", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", 
"a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a car accelerates and wind blows"], "sample_ids": ["wyllXV6PjKo", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["a baby, a woman, a man", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a man speaks as a motor runs in the background"], "sample_ids": ["xjhAnI2q6hM", "xZepNM9qcRA"], "start_seconds": ["6", "30"], "properties": ["wind, blow, loudly", "background, motor, run"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "multiple beeps are followed by a squawk and a child speaking"], "sample_ids": ["vbpKkWvfOu4", "w34HjHr6gAY"], "start_seconds": ["560", "30"], "properties": ["a, woman, man", "beeps, squawk, child speaking"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard 
of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a beep sounds followed by a child speaking"], "question": "which entity has a child speaking?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "rain falls on a surface as men speak and music plays"], "sample_ids": ["wRV8yMk886E", "w0xsN8X18Y"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "music, surface, rain"], "captions_pred_video": ["two cars are parked in a parking lot at night", null], "captions_pred_audio": ["a man speaks followed by a loud burst", "a man is speaking while a motorboat is moving in the background "], "question": "which entity has a nozzle spraying liquid?", "label": 0}, {"captions": ["bees buzz and wind blows", "a car speeding up in the distance"], "sample_ids": ["tMJne1a4AFI", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["bees buzz, wind blows, bees", "distance, car, speed"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "tapping occurs then a baby cries"], "sample_ids": ["xfudFO976zE", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["animal, bleats, cry", "a, cry, baby"], "captions_pred_video": ["footage is blurry and shaky", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": 
["a man speaks while a machine runs before a smoke alarm beeps", "a man speaks over intermittent keyboard taps"], "sample_ids": ["sG7TyPnFDR0", "tw76HGONaKg"], "start_seconds": ["180", "570"], "properties": ["beeps, machine, smoke alarm", "audio, man, keyboard"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["y682ml90jGw", "su6FAOcOA8c"], "start_seconds": ["11", "4"], "properties": ["beeps, series, electronic", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "a train horn blows as it 
passes by"], "sample_ids": ["vms5XGTDVQc", "zVacuqSb4LI"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "horn, blows, train"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["paper is crumpled and crinkled", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["yLy-WycbVVE", "vzceMbklWc"], "start_seconds": ["30", "180"], "properties": ["background, people, talk", "water, faucet, sink"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "water is running and a man is speaking"], "question": "which entity has a sink?", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "people speak as gunfire rings out"], "sample_ids": ["vqZuVbG6-HI", "wqTCwqVRDlk"], "start_seconds": ["130", "80"], 
"properties": ["background, male, female", "gunfire, ring, speak"], "captions_pred_video": ["footage is blurry because it's raining outside", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking and a gun is fired"], "question": "which entity has more gunfire", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": ["vXlk0lIQBFo", "wnpJndXuxLc"], "start_seconds": ["470", "50"], "properties": ["wind, talk, vocalize", "beeps, loud, whistle"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a baby cries and a woman moans", "a woman talking as an infant is crying"], "sample_ids": ["smDKStoHBJo", "tMbMDvT50j8"], "start_seconds": ["0", "12"], "properties": ["a, cry, woman", "a, talk, infant"], "captions_pred_video": ["a man holding a crying baby in his arms", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a baby cries and a woman speaks"], "question": "which entity has a woman talking to an infant?", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["spYNpeN7rPY", "uEU-Hg5MTN8"], "start_seconds": ["1", "27"], "properties": ["a clock, ticktock, man", "a woman, laughs, animal"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a clock in it?", "label": 0}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sEprKHm8Sj8", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["car, tires, slows", "airplane, boy, fly"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman speaks while a helicopter flies overhead "], "question": 
"which entity is flying", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "small dogs yip and bark sharply"], "sample_ids": ["zofjfKhqLk8", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["noise, stop, motor", "bark, yip, sharply"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["an insect buzzes around continuously", "a car accelerates and wind blows"], "sample_ids": ["v25l1jef3JY", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["buzzes, continuously, insect", "accelerates, wind, blows"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xyx6eNVEYRY", "xBxDz0CFVn0"], "start_seconds": ["380", "30"], "properties": ["loud, engine, muffles", "stream, water, flow"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage is blurry and out of focus"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is flowing", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "an insect buzzes around continuously"], "sample_ids": ["vhJWZheqaE", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["water drains unevenly, toilet flushes, water drains", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background 
with a cartoon character in the foreground"], "captions_pred_audio": ["a toilet is flushed", "a fly is buzzing around a microphone "], "question": "which entity is a source of noise", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a car accelerates and wind blows"], "sample_ids": ["vcmWSmvti8", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["music, man, fire", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a duck quacks continuously"], "sample_ids": ["yZp6xizR0yU", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "quacks, continuously, duck"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a woman speaks happily and an animal chirps"], "sample_ids": ["sK4u5T8hW78", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["a, car, pass", "a woman, chirps, animal"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and 
a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["some men converse over an engine running", "a vehicle is skidding and squealing tires"], "sample_ids": ["sCiy7QS1U", "soTOh3zYJfY"], "start_seconds": ["300", "40"], "properties": ["men, converse, engine", "vehicle, skid, tires"], "captions_pred_video": [null, "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a man speaks followed by another man speaking outside"], "sample_ids": ["v-wcQf4BDY0", "viuTg1M-dqg"], "start_seconds": ["120", "30"], "properties": ["bark, yip, sharply", "two men, speak, follow"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["long loud burping by a man", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xmiUIOhtZyQ", "wDVMhEdTiVw"], "start_seconds": ["60", "30"], "properties": ["loud, burp, man", "gun, shoot, water"], "captions_pred_video": ["homer simpson drinking a beer", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person burps and music plays in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not followed by water sloshing nearby?", "label": 0}, {"captions": ["a person uses a saw to cut some wood", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sHbXC6na9hg", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["a person, saw, wood", "a woman, something, 
fried"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "- a woman cooking in the kitchen"], "captions_pred_audio": ["an engine is idling and vibrating", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vzxHnu-SFEw", "uEU-Hg5MTN8"], "start_seconds": ["80", "27"], "properties": ["two objects, woman, speak", "animal, grunts, snorts"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["people speak and tapping occurs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tFCUUGdREgA", "wDVMhEdTiVw"], "start_seconds": ["70", "30"], "properties": ["people, tap, speak", "gun, shoot, water"], 
"captions_pred_video": ["a person riding a white horse in an indoor arena", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause water to slosh", "label": 1}, {"captions": ["a child speaks in closed space", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yW6FWLSLkx4", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["child, space, speak", "female, spraying, scream"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a infant makes noise and is excited"], "sample_ids": ["zgUgkpk78xU", "wIJK3-5y0kA"], "start_seconds": ["70", "30"], "properties": ["clinking, humming, horn", "noise, excited, infant"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "some tunes played by whistling"], "sample_ids": ["w34HjHr6gAY", "u6BnG6YZqJ4"], "start_seconds": ["30", 
"0"], "properties": ["beeps, hit, woman", "tune, play, whistling"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["an airplane engine runs", "some men converse over an engine running"], "sample_ids": ["yVPZ2MNWpms", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["engine, airplane, runs", "men, converse, engine"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying while a woman speaks", "a beep occurs briefly"], "sample_ids": ["yhQ2Lg-7qDY", "xtWeJ56-U-g"], "start_seconds": ["130", "20"], "properties": ["food, woman, speak", "beep, occur, briefly"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8"], "captions_pred_audio": ["a faucet is running and a man is speaking", "mechanisms are 
ticking and a beep is heard "], "question": "which entity is silent", "label": 1}, {"captions": ["a toilet flushes and water drains", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sfAvvZwdLCY", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["water drains, flushes, water", "three men, wind, flow"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing", "label": 1}, {"captions": ["a man is filing a hard object", "water is sprayed across a hard surface"], "sample_ids": ["vveS8HT7Uog", "sQwlkXjQabo"], "start_seconds": ["100", "10"], "properties": ["a man, hard, object", "water, spray, surface"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "spraying followed by silence"], "question": "which object is harder to file", "label": 0}, {"captions": ["water flows as men speak and yell", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vJ7JPEFhyLA", "tdWhHV3X25Q"], "start_seconds": ["16", "60"], "properties": ["water, flow, men", "applause, audience, yells"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "vehicles pass by on a roadway"], "sample_ids": ["vveS8HT7Uog", "tgbONvsP47Y"], "start_seconds": 
["100", "0"], "properties": ["a man, objects, speak", "pass, vehicle, roadway"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a duck quacks continuously"], "sample_ids": ["xSKJGCItUWE", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["engine, work, child", "quacks, continuously, duck"], "captions_pred_video": ["footage of the helicopter flying in the room", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a vehicle engine runs as a siren and horn sound", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u--KhUW8l1Y", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["sound, vehicle, horn", "airplane, boy, fly"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a clock ticktocks continuously", "someone is typing on a computer keyboard"], "sample_ids": ["vlJS7LN2XyM", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["ticktocks, clock, ticktocks continuously", "keyboard, type, computer"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are 
hard to see due to the blurriness of the video", "how to make money on youtube in spanish"], "captions_pred_audio": ["a ticktock of a clock", "a person is typing on a keyboard"], "question": "which object is moving", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["xKB8O8LTs6s", "vVhthZ45k3Y"], "start_seconds": ["70", "30"], "properties": ["music, gunshots, explosion", "cat, purr, hiss"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage is blurry and out of focus"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and a cat is meowing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks as a machine runs", "vehicles pass by on a roadway"], "sample_ids": ["vD6lYD1l0BY", "tgbONvsP47Y"], "start_seconds": ["330", "0"], "properties": ["a, machine, run", "pass, vehicle, roadway"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes and water drains", "plastic is tapped on while someone speaks"], "sample_ids": ["sfAvvZwdLCY", "wvKpEYswXO0"], "start_seconds": ["20", "150"], "properties": ["water drains, flushes, water", "plastic, tap, speak"], "captions_pred_video": ["footage of the toilet in the bathroom", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a baby laugh at a sputter"], 
"sample_ids": ["ziUT9IFTkjg", "sLUnaPT5gM8"], "start_seconds": ["10", "0"], "properties": ["background, birds, rustling", "laugh, sputter, baby"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more likely to be in a forest?", "label": 0}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a duck quacks loudly and continuously"], "sample_ids": ["sjlVMgdGSK0", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["car, revving, loudly", "loud, continuous, quacks"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "winds blows roughly as a vehicle races past"], "sample_ids": ["sAam2NqGhLY", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["snoring, breathing, child", "wind, blows, vehicle"], "captions_pred_video": ["of a little girl sleeping on a couch", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a person is snoring", "a jet engine roars and wind blows "], "question": "which entity is not a person", "label": 1}, {"captions": ["a child yells and another yells", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vMDHu7Lxcgw", "xBxDz0CFVn0"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "stream, water, flow"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking 
with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["uiItxDsDMFI", "vlJS7LN2XyM"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "background, clocks, ticking"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a saw is being used with background noise ", "a ticktock of a clock"], "question": "which entity is a video of a person sawing wood?", "label": 0}, {"captions": ["a woman talks while a baby cries and a man whispers", "a stream of water flows as people talk and wind blows"], "sample_ids": ["smDKStoHBJo", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["a, talk, baby, cry", "stream, water, flow"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["a male speaks and another male speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["viuTg1M-dqg", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["two males, speaking, male", "water, radio, man"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a person screams 
glaringly", "water is sprayed across a hard surface"], "sample_ids": ["xC8kbrKJmco", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["glaringly, screams, person", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a goat is bleating ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a weapon fires multiple times", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sMC07Ucy7kg", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["weapon, fire, multiple", "engine, accelerate, idle"], "captions_pred_video": ["footage is from a car's point of view", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "an engine is idling"], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a goat screams and people speak in the background", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xC8kbrKJmco", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["background, goat, scream", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a machine beeps continuously", "paper is crumpling consistently"], "sample_ids": ["y682ml90jGw", "v5cSxLaHADY"], "start_seconds": ["11", "0"], "properties": ["beeps, machine, continuously", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a beeping sound is being made ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": 
["dogs bark as an engine runs and a person whistles", "a child speaks in closed space"], "sample_ids": ["zY3icUyMdh8", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["dog, bark, engine", "child, space, speak"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a male speaks and another male speaks", "an infant crying frantically"], "sample_ids": ["viuTg1M-dqg", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "cry, infant, frantically"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "paper folding and crinkling"], "sample_ids": ["tK4VlLsNxak", "zPpG3RD8lSs"], "start_seconds": ["120", "20"], "properties": ["a, dial, telephone", "paper, fold, crinkle"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "the wind blows and a mouse clicks "], "question": "which is not a rotary telephone", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sG7TyPnFDR0", "wDVMhEdTiVw"], "start_seconds": ["180", "30"], "properties": ["beeps, machine, smoke alarm", "gun, shoot, water"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xNMovAf3o50", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["rain, thunder, music", "airplane, boy, fly"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xhmRY9yhC7c", "vfYTJq7nU"], "start_seconds": 
["20", "130"], "properties": ["a, laugh, infant", "rustling, ducks, quack"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a woman and an infant?", "label": 0}, {"captions": ["wind blowing followed by a zoom", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vr8ZXjEBhMQ", "wqZ135Ssz0"], "start_seconds": ["150", "60"], "properties": ["wind, blow, zoom", "two men, woman, birds"], "captions_pred_video": ["is taken from a motorcycle's point of view", null], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["rustling with distant murmuring", "small dogs yip and bark sharply"], "sample_ids": ["wnNNcxAPwGQ", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["sound, distance, rustling", "bark, yip, sharply"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vJvryTwuAV8", "zl9Dqx-j7q4"], "start_seconds": ["16", "6"], "properties": ["audience, cheer, man", "engine, laugh, loud"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a jet engine roars "], "question": "which entity is 
louder", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a toilet flushes and a female speaks"], "sample_ids": ["tOSWIURC-4", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["engine, work, nearby", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a lawn mower is running ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a person is snoring while sleeping"], "sample_ids": ["sjlVMgdGSK0", "vJrjSeP17yE"], "start_seconds": ["30", "40"], "properties": ["car, revving, loudly", "a person is sleeping, snoring, person"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a clock ticktocks"], "sample_ids": ["zF8yoL0rkbI", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["engine, run, someone", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the traffic on the street at night", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "paper is crumpling consistently"], "sample_ids": ["wvKpEYswXO0", "v5cSxLaHADY"], "start_seconds": ["150", "0"], "properties": ["sound, water, running", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of the person holding a pair of 
scissors"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["some people speak", "winds blows roughly as a vehicle races past"], "sample_ids": ["vbZ-0lGPneg", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "wind, blows, vehicle"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["two frogs croak at each other", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zg0X6BnhOLQ", "vb1fPSDI4c"], "start_seconds": ["410", "30"], "properties": ["two frogs, croak, at each other", "multiple, people, yell"], "captions_pred_video": ["footage of lightning in the sky at night", null], "captions_pred_audio": ["a frog is croaking", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows and women speak as livestock vocalizes", "water is sprayed across a hard surface"], "sample_ids": ["vXlk0lIQBFo", "sQwlkXjQabo"], "start_seconds": ["470", "10"], "properties": ["wind, speak, vocalize", "water, spray, surface"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y1saVTXsKwc", "uEU-Hg5MTN8"], "start_seconds": 
["80", "27"], "properties": ["a, dog, talk", "a woman, laughs, animal"], "captions_pred_video": ["a dog playing with a pink ball", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a dog barks and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a woman talking to an animal?", "label": 0}, {"captions": ["a jet engine spools up and takes off", "an adult male speaks and dials a rotary phone"], "sample_ids": ["vBslzh7saPw", "tK4VlLsNxak"], "start_seconds": ["90", "120"], "properties": ["engine, spools, takes", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and using a sewing machine"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["siJFXfGWgDk", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["a, bird, vehicle", "people, applaud, hoot"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a large crowd cheers and applauds"], "sample_ids": ["ugHJF0hfYkg", "rqfQRErjfk8"], "start_seconds": ["10", "170"], "properties": ["engine, running, continuously", "crowd, cheers, applauds"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man hugging another man in front of an orchestra"], "captions_pred_audio": ["a helicopter is flying overhead ", "a crowd of people clapping and cheering"], 
"question": "which entity is a human activity", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "a man speaks as a car is passing by"], "sample_ids": ["tgbONvsP47Y", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["noise, truck, accelerate", "a, car, pass"], "captions_pred_video": ["footage of a fire truck entering a garage", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is driving on the road ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a vehicle", "label": 1}, {"captions": ["white noise and birds chirping", "a propeller rotates loudly and intensely"], "sample_ids": ["wRBHTgrbiwg", "ugHJF0hfYkg"], "start_seconds": ["50", "10"], "properties": ["noise, white, chirping", "loud, intense, propeller"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a helicopter is flying overhead "], "question": "which noise is louder", "label": 1}, {"captions": ["a child babbles as a woman speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["wEBlkGWVWwE", "wqZ135Ssz0"], "start_seconds": ["260", "60"], "properties": ["a, babble, woman", "two men, woman, birds"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking and ducks are quacking with wind noise in the background "], 
"question": "which entity has more people", "label": 1}, {"captions": ["an adult woman and an adult man speak", "people applaud and hoot and chat quietly"], "sample_ids": ["zTLVJCo4WEE", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["two people, adult, speak", "people, applaud, hoot"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xfaoyyzw2WU", "su6FAOcOA8c"], "start_seconds": ["180", "4"], "properties": ["loud, jet engine, roar", "engine, idle, woman"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a woman is speaking and a subway train is moving "], "question": "which engine is quieter", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "an airplane engine spools and people speak"], "sample_ids": ["sjlVMgdGSK0", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["accelerates, vehicle, race car", "airplane, engine, spool"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine is running and people are talking"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w5W5Kqtc8E", "wz7N8YRy74I"], "start_seconds": ["100", "30"], "properties": 
["wind, blow, vehicle", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a woman speaks followed by another woman whimpering and speaking"], "sample_ids": ["xBxDz0CFVn0", "xOZfdgAgJ9o"], "start_seconds": ["30", "40"], "properties": ["wind, chatter, people", "woman, whimpering, speaking"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a woman talking to a man in a doctor's office"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wsHBIgzs9Fs", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["horn, continuous, buzzing", "stream, water, flow"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage is blurry and out of focus"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a drill runs and two people laugh"], "sample_ids": ["u7C-AEBQM", "tEE3MpBt1sg"], "start_seconds": ["30", "50"], "properties": ["ticks, rhythmic, quiet", "two people, laugh, drill"], "captions_pred_video": [null, "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a ticktock of a clock", "people are laughing breathing 
and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "an airplane flies overhead as a woman speaks"], "sample_ids": ["vh30P49Po6s", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["loud, continuous, quacks", "airplane, fly, overhead"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a duck is quacking loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying overhead", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a man speaks with another voice speaking in the background"], "sample_ids": ["su6FAOcOA8c", "u21-Z5gJCB8"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "background, voice, man"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a church bell rings several times", "a child speaks in closed space"], "sample_ids": ["sUVVjE3Ucp8", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["ring, bell, several", "child, space, speak"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a church bell is ringing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a car speeding up in 
the distance"], "sample_ids": ["tIY7qOV3rEM", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "distance, car, speed"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a baby cries and a woman moans", "paper is crumpling consistently"], "sample_ids": ["smDKStoHBJo", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["a, cry, woman", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an engine runs and a man speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yT5WfYMRr-U", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["engine, run, man", "a woman, something, fried"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["xvDdE3zNf8Y", "sLUnaPT5gM8"], "start_seconds": ["120", "0"], "properties": ["a, female, speaks", "loud, laughter, intermittent"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman speaks 
and crumples paper", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a piece of wood is being placed down and sawed", "an infant crying and a woman speaking with some distant murmuring"], "sample_ids": ["uiItxDsDMFI", "smDKStoHBJo"], "start_seconds": ["30", "0"], "properties": ["wood, piece, saw", "a, infant, speaking"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a man holding a crying baby in his arms"], "captions_pred_audio": ["a saw is being used with background noise ", "a baby is crying and a woman is speaking"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zgUgkpk78xU", "wDVMhEdTiVw"], "start_seconds": ["70", "30"], "properties": ["horn, bells, ring", "gun, shoot, water"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a warning", "label": 0}, {"captions": ["a man speaks then multiple motorcycles pass by", "water splashes as an animal walks through"], "sample_ids": ["zcDwZ6W7E3E", "w1ir-sZ3Im8"], "start_seconds": ["180", "90"], "properties": ["a, man, speak", "animal, water, splashes"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while a 
car accelerates and revs its engine ", "water splashes and gurgles as people speak"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["water flows as a woman laughs and a man speaks", "some men converse over an engine running"], "sample_ids": ["vddP56-ogds", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["water, flow, laugh", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man speaking?", "label": 0}, {"captions": ["someone whistles briefly", "a car speeding up in the distance"], "sample_ids": ["uFoga8sHpiw", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["sound, duration, pitch", "distance, car, speed"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "vehicles pass by on a roadway"], "sample_ids": ["zl9Dqx-j7q4", "tgbONvsP47Y"], "start_seconds": ["6", "0"], "properties": ["motors rev, laugh, loudly", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a jet engine roars ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "some men converse over an engine running"], "sample_ids": ["zofjfKhqLk8", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["background, metal, clings", "men, converse, engine"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", 
"a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a machine?", "label": 0}, {"captions": ["a horn blasts loudly as a train passes", "a man speaks as a vehicle engine idles"], "sample_ids": ["zsLxS-uLJTw", "shmR4OZtzqA"], "start_seconds": ["20", "30"], "properties": ["horn, blast, train", "man, engine, idle"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a man speaks while a motor runs"], "question": "which entity is stationary", "label": 1}, {"captions": ["water flows followed by women screaming", "water is sprayed across a hard surface"], "sample_ids": ["w5W5Kqtc8E", "sQwlkXjQabo"], "start_seconds": ["100", "10"], "properties": ["water, flow, women", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "spraying followed by silence"], "question": "which entity is a video of water flowing?", "label": 0}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "water is sprayed across a hard surface"], "sample_ids": ["y8WEcpOlT3I", "sQwlkXjQabo"], 
"start_seconds": ["40", "10"], "properties": ["harsh, wind, blows", "water, spray, surface"], "captions_pred_video": ["on how to use a sewing machine youtube", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a clock ticktocks briefly", "a man is snoring loudly and repeatedly"], "sample_ids": ["u7C-AEBQM", "sncRqQ67iJU"], "start_seconds": ["30", "460"], "properties": ["ticktocks, clock, ticktocks briefly", "loud, repeatedly, man"], "captions_pred_video": [null, "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a ticktock of a clock", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "multiple people speak and children yell while water gurgles"], "sample_ids": ["rqu8iB22IY", "vb1fPSDI4c"], "start_seconds": ["5", "30"], "properties": ["sound, repeats, laugh", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking?", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "a fly buzzes around loudly as birds chirp"], "sample_ids": ["yYJksgsxx5U", "uJV8NDaHqqk"], "start_seconds": ["30", "100"], "properties": ["audio, clicks, scraping", "loud, fly, chirp"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "a bee hive in a wooden box"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a swarm of bees buzzing around"], "question": "which entity is louder", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a propeller moves loudly nearby"], "sample_ids": ["sHbXC6na9hg", 
"ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["a person, saw, wood", "loud, propeller, move"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["an engine is idling and vibrating", "a helicopter is flying overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a toilet flushes and water drains"], "sample_ids": ["u--KhUW8l1Y", "sfAvvZwdLCY"], "start_seconds": ["0", "20"], "properties": ["horn, siren, life", "water drains, flushes, water"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a person speaks as a cage rattles, birds chips and flap wings in the background", "birds chirp and a man speaks"], "sample_ids": ["v0wPrLBI3hg", "zuua6-5goWw"], "start_seconds": ["30", "30"], "properties": ["background, person, cage", "chirp, speak, bird"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "birds are chirping and a man is speaking with background 
noise "], "question": "which entity has a person speaking and birds chirping?", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "birds chirp and objects are moved around"], "sample_ids": ["xKB8O8LTs6s", "yPUYU6t3rwo"], "start_seconds": ["70", "370"], "properties": ["music, gunfire, explosion", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "insects buzz and a man speaks"], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wTideSjRFS0", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["food, sizzle, woman", "engine, revs, vehicle"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a duck quacks continuously"], "sample_ids": ["vddP56-ogds", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["water, splash, person, laugh", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "wind blows as people chatter quietly"], "sample_ids": ["u2f5NpsoHBg", "xBxDz0CFVn0"], 
"start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "wind, chatter, people"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["spYNpeN7rPY", "uEU-Hg5MTN8"], "start_seconds": ["1", "27"], "properties": ["a clock, ticktock, man", "animal, grunts, snorts"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["music plays and animals 
vocalize as a cartoon character makes sounds", "a duck quacks continuously"], "sample_ids": ["weDbePuc-Xc", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["cartoon character, music, vocalize", "quacks, continuously, duck"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a duck is quacking loudly"], "question": "which entity is a duck?", "label": 1}, {"captions": ["bees buzz and wind blows", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["tMJne1a4AFI", "tw76HGONaKg"], "start_seconds": ["0", "570"], "properties": ["bees buzz, wind blows, bees", "A, game, keyboard"], "captions_pred_video": ["a swarm of bees on the ground", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man speaks and types on a computer keyboard "], "question": "which entity is not a video game?", "label": 0}, {"captions": ["a drill runs and two people laugh", "a clock ticktocks"], "sample_ids": ["tEE3MpBt1sg", 
"v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "people applaud and hoot and chat quietly"], "sample_ids": ["tZGN5a7ybxo", "wwyfGO2J4"], "start_seconds": ["60", "90"], "properties": ["ring, train, horn", "people, applaud, hoot"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", null], "captions_pred_audio": ["a train is moving and blowing its horn ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["tQWGZLItBXk", "tw76HGONaKg"], "start_seconds": ["170", "570"], "properties": ["music, kid, speak", "A, game, keyboard"], "captions_pred_video": ["worms revolution screenshots", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of 
time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man speaks and types on a computer keyboard "], "question": "which entity has a keyboard?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["tDVADusiIoc", "x9JovgqUcs"], "start_seconds": ["60", "500"], "properties": ["water, radio, man", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man speaks and types on a keyboard"], "question": "which entity is indoors", "label": 1}, {"captions": ["a man talks as several small engines run", "a duck quacks continuously"], "sample_ids": ["u9A6VZQCZpU", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["ul60S8TXDA8", "tDVADusiIoc"], "start_seconds": ["60", "60"], "properties": ["sound, distance, bell", "water, radio, man"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking 
over a radio?", "label": 1}, {"captions": ["a weapon fires multiple times", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["sMC07Ucy7kg", "y2bVZ7rz-5M"], "start_seconds": ["10", "280"], "properties": ["weapon, fire, multiple", "motor noise, horn, siren"], "captions_pred_video": ["footage is from a car's point of view", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is not a weapon?", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["xjvTpk2Zpr8", "vXlk0lIQBFo"], "start_seconds": ["70", "470"], "properties": ["wind, blows, vehicle", "wind, speak, vocalize"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a jet engine roars and wind blows ", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is more calm", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "wind blows as people chatter quietly"], "sample_ids": ["zofjfKhqLk8", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "wind, chatter, people"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage is blurry and out of focus"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vf44CgrjT0A", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["loud, long, person", "music, gunfire, 
explosion"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a loud burp", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a door slams shut roughly", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zkKdxzNC97Y", "vb1fPSDI4c"], "start_seconds": ["27", "30"], "properties": ["a door, slams, shut", "multiple, people, yell"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a woman speaks happily and an animal chirps"], "sample_ids": ["tQWGZLItBXk", "uWAAAL4CIoc"], "start_seconds": ["170", "0"], "properties": ["voice, music, whoosh", "a woman, chirps, animal"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a dog is barking "], "question": "which entity has a woman speaking and an animal chirps?", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "some men converse over an engine running"], "sample_ids": ["s4Uz1Ffgo04", "sCiy7QS1U"], "start_seconds": ["100", "300"], "properties": ["water, rushes, vehicle", "men, converse, engine"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a vehicle zooming past?", "label": 0}, 
{"captions": ["multiple motorcycles pass by as a man speaks", "several insects fly while two men talk"], "sample_ids": ["zcDwZ6W7E3E", "s-T9OVOiMLo"], "start_seconds": ["180", "330"], "properties": ["man, speak, motorcycles", "several, fly, men"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a man speaking to multiple motorcycles?", "label": 0}, {"captions": ["water pouring and bubbling", "winds blows roughly as a vehicle races past"], "sample_ids": ["uyRfq-jKPpo", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["water, bubbles, pouring", "wind, blows, vehicle"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["water is running from a faucet", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a diesel truck engine runs while wind blows", "a man speaks as a car is passing by"], "sample_ids": 
["xyL9F5VrjkE", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["engine, run, wind", "a, car, pass"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a woman and man speak while food is frying", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zk-xJGQU8-4", "wqZ135Ssz0"], "start_seconds": ["130", "60"], "properties": ["food, man, woman", "two men, woman, birds"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["people speak then an engine runs", "a woman speaks happily and an animal chirps"], "sample_ids": ["uMTTDZ2mb4", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["engine, run, people", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is about a woman speaking and an animal chirping?", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "vehicles 
pass by on a roadway"], "sample_ids": ["tqR406bGiE", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["flush, water, gurgle", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a toilet is flushed", "a car is driving on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a dog barks and whimpers", "water flows and trickles"], "sample_ids": ["sShpyu2l4YQ", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "water, flow, trickle"], "captions_pred_video": ["the puppies are playing with a toy", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a dog is barking and growling", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w2JXXIAdUdg", "wDVMhEdTiVw"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "gun, shoot, water"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause injury", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "wind blows as people chatter quietly"], "sample_ids": ["w2M4i1mklOA", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "wind, chatter, people"], "captions_pred_video": ["footage of an antique clock", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking with wind noise in the background "], "question": 
"which entity is quieter", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "water is sprayed across a hard surface"], "sample_ids": ["su6FAOcOA8c", "sQwlkXjQabo"], "start_seconds": ["4", "10"], "properties": ["engine, run, woman", "water, spray, surface"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a child speaks in closed space"], "sample_ids": ["tPJvjq9QePY", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["animal, bleat, moo", "child, space, speak"], "captions_pred_video": ["a dog and a sheep in a barn", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a baby cries and a man speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vSeGhaZt-aI", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["water, bubbles, run", "a woman, something, fried"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["wyllXV6PjKo", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["a kid, talk, cry", 
"sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a woman speaks and a baby cries", "a goat bleats and birds chirp"], "question": "which entity is about animals?", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["y2ZBGpgbhHM", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["dog, chirp, breathe", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "people cheer as a vehicle engine revs"], "sample_ids": ["vuUVPzd2FXw", "xjhAnI2q6hM"], "start_seconds": ["160", "6"], "properties": ["a, steam, release", "engine revs, vehicle, people"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a child speaks in closed space"], "sample_ids": ["y8WEcpOlT3I", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["harsh, wind, blows", "child, space, speak"], "captions_pred_video": ["on how to use a sewing machine youtube", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a woman speaks as 
she rubs two objects together", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vzxHnu-SFEw", "ukg5L09Wpvo"], "start_seconds": ["80", "150"], "properties": ["two objects, woman, speak", "clickety-clack, train, whistle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["goats bleat and metal clings", "someone is typing on a computer keyboard"], "sample_ids": ["tH17JPjDPnc", "v0x1odnXtP0"], "start_seconds": ["260", "210"], "properties": ["bleat, metal, clings", "keyboard, type, computer"], "captions_pred_video": ["feed of the goats eating hay in the barn", "how to make money on youtube in spanish"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": 
["a dog barks and whimpers", "people speak in a closed space"], "sample_ids": ["sShpyu2l4YQ", "sTpirNYo8vQ"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "people, space, speak"], "captions_pred_video": ["the puppies are playing with a toy", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking while a car is revving and accelerating "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tOSWIURC-4", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["noise, engine, revs", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a lawn mower is running ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a man speaks as a car is passing by"], "sample_ids": ["yZmhM1HcsyE", "sK4u5T8hW78"], "start_seconds": ["4", "30"], "properties": ["engine, roar, water", "a, car, pass"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["people speak then an engine 
runs", "people cheer as a vehicle engine revs"], "sample_ids": ["uMTTDZ2mb4", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["engine, run, people", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity shows people speaking?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "people cheer as a vehicle engine revs"], "sample_ids": ["zY3icUyMdh8", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["dog, bark, engine", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a man speaks as a car is passing by"], "sample_ids": ["w0xsN8X18Y", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["music, surface, rain", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking with background noise and breathing 
sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["birds fly and flutter around", "water splashing and a person laughs in the distance then a man speaks nearby"], "sample_ids": ["wGKgwOP3h30", "vddP56-ogds"], "start_seconds": ["30", "30"], "properties": ["fly, flutter, around", "water, splash, person, laugh"], "captions_pred_video": ["of the pigeons in the coop", null], "captions_pred_audio": ["pigeons coo and flap their wings", "water is running and gurgling and a man is speaking"], "question": "which entity is more active", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a man sprays as a scraping occurs in the background"], "sample_ids": ["y4tPJXBKDig", "sOa7g-44Dag"], "start_seconds": ["20", "30"], "properties": ["a, noise, talk", "background, man, spray"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a man is speaking and rubbing his hands together "], "question": "which entity is a person", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["zcDwZ6W7E3E", "viuTg1M-dqg"], "start_seconds": ["180", "30"], "properties": ["man, speak, motorcycles", "two men, speak, follow"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "multiple people speak and children yell while 
water gurgles"], "sample_ids": ["xV7Mg1QucSc", "vb1fPSDI4c"], "start_seconds": ["14", "30"], "properties": ["alarm, ticktocks, laughs", "multiple, people, yell"], "captions_pred_video": ["a cuckoo clock hanging on the wall", null], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "pigeons vocalize and birds chirp"], "sample_ids": ["yYEVLuqEytU", "uiS58TNyUiw"], "start_seconds": ["40", "430"], "properties": ["animal, pig, background", "vocalize, bird, chirp"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of the pigeon in the cage"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an insect buzzes around continuously", "an infant crying as a woman laughs"], "sample_ids": ["v25l1jef3JY", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["buzzes, continuously, insect", "a, laugh, infant"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a baby cries and a woman speaks"], "question": "which entity is not a person", "label": 0}, {"captions": ["a man speaks followed by another man speaking outside", "water flows and trickles"], "sample_ids": ["viuTg1M-dqg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["two men, speak, follow", "water, flow, trickle"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", 
"water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a drill runs and two people laugh", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tEE3MpBt1sg", "vfYTJq7nU"], "start_seconds": ["50", "130"], "properties": ["two people, laugh, drill", "rustling, ducks, quack"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a duck quacks and a woman speaks"], "question": "which entity is about a drill?", "label": 0}, {"captions": ["a toilet flushes and water sputters as it drains", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["smGI3C1NZc", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["water, drain, toilet", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a dog is whimpering"], "question": "which entity is a video of a toilet?", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "vehicles pass by on a roadway"], "sample_ids": ["sQwlkXjQabo", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["liquid, surface, spray", "pass, vehicle, roadway"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of a fire truck entering a garage"], "captions_pred_audio": ["spraying followed by silence", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as a car is passing by", "an infant crying as a woman laughs"], "sample_ids": ["sK4u5T8hW78", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["a, car, pass", "a, laugh, infant"], "captions_pred_video": ["for 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["w5W5Kqtc8E", "vlS6YMeWAPo"], "start_seconds": ["100", "40"], "properties": ["water, splashes, motorboat", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a machine runs continuously", "wind blowing followed by a zoom"], "sample_ids": ["wdXV3Pv0jiY", "vr8ZXjEBhMQ"], "start_seconds": ["11", "150"], "properties": ["machine, running, continuously", "wind, blow, zoom"], "captions_pred_video": ["footage is blurry and shaky", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not running continuously", "label": 1}, {"captions": ["scraping and female speech with distant music", "a woman speaks and is crumpling paper"], "sample_ids": ["yHeVV-xeOxQ", "xvDdE3zNf8Y"], "start_seconds": ["130", "120"], "properties": ["female, speech, music", "A, crumple, paper"], "captions_pred_video": ["of a girl milking a goat's udder", "of a 
woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a woman speaks and crumples paper"], "question": "which entity is a woman speaking?", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "rain falls on a surface as men speak and music plays"], "sample_ids": ["spYNpeN7rPY", "w0xsN8X18Y"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "music, surface, rain"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", null], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking while a motorboat is moving in the background "], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["a man speaks as bees buzz and birds chirp", "a person snores loudly multiple times at a close distance"], "sample_ids": ["t25U-v4k4ts", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["bees 
buzz, birds chirp, man speaks", "loud, multiple, distance"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a clock ticktocks"], "sample_ids": ["y682ml90jGw", "v-g-j2uTByM"], "start_seconds": ["11", "30"], "properties": ["beeps, series, electronic", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a beeping sound is being made ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["vW4x7S1VfQc", "tw76HGONaKg"], "start_seconds": ["150", "570"], "properties": ["clacking, oil, woman", "A, game, keyboard"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["food sizzles in a frying 
pan", "a man speaks and types on a computer keyboard "], "question": "which entity is a video of a man playing a video game?", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wudZTNBtVqc", "wz7N8YRy74I"], "start_seconds": ["60", "30"], "properties": ["accelerates, engine, wind", "rooster, crow, background, men"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "an airplane engine spools and people speak"], "sample_ids": ["yNtRmrn0io8", "wTjoRj1se3U"], "start_seconds": ["210", "390"], "properties": ["storm, distance, strike", "airplane, engine, spool"], "captions_pred_video": ["footage of a house in the middle of the night", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["rain falls and thunder roars", "a jet engine is running and people are talking"], "question": "which is a moving object", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a telephone rings followed by a woman talking"], "sample_ids": ["vSeGhaZt-aI", "tGcFnX0GHI"], "start_seconds": ["50", "0"], "properties": ["water, sink, talk", "ring, talk, woman"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "birds chirp and an owl 
hoots before a man speaks briefly"], "sample_ids": ["wvKpEYswXO0", "wRBHTgrbiwg"], "start_seconds": ["150", "50"], "properties": ["water, tap, run", "bird, owl, speak"], "captions_pred_video": ["of the person preparing food in the kitchen", "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "birds are chirping and insects are buzzing"], "question": "which entity has more birds", "label": 1}, {"captions": ["multiple ducks quack continuously", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wfHeoPDLMaM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "stream, water, flow"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage is blurry and out of focus"], "captions_pred_audio": ["ducks are quacking", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["weDbePuc-Xc", "yDoT73BWsdA"], 
"start_seconds": ["40", "10"], "properties": ["music, slaps, human", "engine, revs, vehicle"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tgbONvsP47Y", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["noise, truck, accelerate", "engine, idle, woman"], "captions_pred_video": ["footage of a fire truck entering a garage", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xyL9F5VrjkE", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["wind, motor, distance", "a woman, something, fried"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "- a woman cooking in the kitchen"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a duck quacks several times", "a train horn blows as it passes by"], "sample_ids": ["vh30P49Po6s", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["quacks, duck, several", "horn, blows, train"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "by mike amstong a video by mike amstong a 
video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a duck is quacking loudly", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["wind blows strongly", "a clock ticktocks"], "sample_ids": ["w8uLijTqtlU", "v-g-j2uTByM"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is blurry and shaky", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["the wind is blowing strongly", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "paper is crumpling consistently"], "sample_ids": ["w2JXXIAdUdg", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["snoring, distance, person", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person snoring and a dog whimpering", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a drill runs and two people laugh", "a 
horn rings out as a machine runs by"], "sample_ids": ["tEE3MpBt1sg", "slZLHwNbbt4"], "start_seconds": ["50", "300"], "properties": ["two people, laugh, drill", "a, horn, run"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a man speaks while a machine runs before a smoke alarm beeps"], "sample_ids": ["siJFXfGWgDk", "sG7TyPnFDR0"], "start_seconds": ["50", "180"], "properties": ["a, bird, vehicle", "beeps, machine, smoke alarm"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a person is using an espresso machine in a restaurant"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and a microwave oven is beeping "], "question": "which entity has a vehicle passing nearby?", "label": 0}, {"captions": ["a male speaks and another male speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["viuTg1M-dqg", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["two males, speaking, male", "animal, grunts, snorts"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity has a more snorts", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zuua6-5goWw", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["birds, chirp, 
quiet, man, speaks", "men, talk, cars"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "pigeons vocalize and birds chirp"], "sample_ids": ["zkKdxzNC97Y", "uiS58TNyUiw"], "start_seconds": ["27", "430"], "properties": ["loud, bang, noise", "vocalize, bird, chirp"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of the pigeon in the cage"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a male is 
speaking and a duck quacks as others laugh", "an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background"], "sample_ids": ["tiDFTC-5vU", "yYEVLuqEytU"], "start_seconds": ["30", "40"], "properties": ["male, duck, laugh", "animal, pig, background"], "captions_pred_video": [null, "a baby goat is being petted by a person's hand"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "several sheep bleat and a man speaks"], "question": "which entity has a pig in it?", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "birds chirp and objects are moved around"], "sample_ids": ["yYEVLuqEytU", "yPUYU6t3rwo"], "start_seconds": ["40", "370"], "properties": ["animal, pig, background", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["several sheep bleat and a man speaks", "insects buzz and a man speaks"], "question": "which entity has more birds", "label": 1}, {"captions": ["a child speaks in closed space", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yW6FWLSLkx4", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["child, space, speak", "multiple, people, yell"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a duck quacks continuously"], "sample_ids": ["wy1eKjR7KC0", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "quacks, continuously, 
duck"], "captions_pred_video": ["two police officers riding motorcycles down the street", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["s4Uz1Ffgo04", "zFjIWfSD-4"], "start_seconds": ["100", "410"], "properties": ["roars, background, people speaking", "People, motor, brakes"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["an small aircraft engine runs and a boy speaks", "an airplane engine runs"], "sample_ids": ["xSKJGCItUWE", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["engine, run, boy", "engine, airplane, runs"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a car is driving by on the road "], "question": "which entity has a boy speaking?", "label": 0}, {"captions": ["a car accelerates and wind blows", "a man talks while a clock does ticktock"], "sample_ids": ["u0TrcHhkPQ", "spYNpeN7rPY"], "start_seconds": ["20", "1"], "properties": ["accelerates, wind, blows", "a clock, ticktock, man"], "captions_pred_video": [null, "in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and breathing with background noise "], "question": "which entity is stationary", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a stream of water runs briefly"], "sample_ids": ["ugHJF0hfYkg", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["engine, running, continuously", "stream, water, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving on a wet road "], "question": "which entity is running continuously", "label": 0}, {"captions": ["a woman talking as an infant is crying", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["tMbMDvT50j8", "y8WEcpOlT3I"], "start_seconds": ["12", "40"], "properties": ["a, talk, infant", "harsh, wind, blows"], "captions_pred_video": ["shows a little girl covering her face with her hands while 
sitting at a table", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is about a woman talking to an infant?", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tK4VlLsNxak", "tDVADusiIoc"], "start_seconds": ["120", "60"], "properties": ["a, dial, telephone", "water, radio, man"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "an infant crying frantically"], "sample_ids": ["sjlVMgdGSK0", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["accelerates, vehicle, race car", "cry, infant, frantically"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "of the baby crying in the car seat"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a clock ticktocks"], "sample_ids": ["vbr9mHKc8WM", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an engine is idling", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "pigeons vocalize and birds chirp"], 
"sample_ids": ["vSeGhaZt-aI", "uiS58TNyUiw"], "start_seconds": ["50", "430"], "properties": ["water, bubbles, speak", "vocalize, bird, chirp"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a male speaks and another male speaks", "water flows and trickles"], "sample_ids": ["viuTg1M-dqg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "water, flow, trickle"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zALy31PjDl0", "uYT5gxnyMWM"], "start_seconds": ["21", "50"], "properties": ["a man, a vehicle, a horn", "female, spraying, scream"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a woman is speaking and a baby is crying"], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["a child speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yW6FWLSLkx4", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["a, child, speaks", "two men, woman, birds"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is 
speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a drill runs and two people laugh", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tEE3MpBt1sg", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["two people, laugh, drill", "a woman, something, fried"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "- a woman cooking in the kitchen"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["someone snores nearby", "people speak softly as food sizzles"], "sample_ids": ["spJCm8tD9Zo", "yhQ2Lg-7qDY"], "start_seconds": ["90", "130"], "properties": ["someone snores, nearby, someone", "food, sizzle, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a person is snoring loudly", "a faucet is running and a man is speaking"], "question": "which entity is more active", "label": 1}, {"captions": ["an electronic device bleeps once", "an airplane engine roars increasingly louder"], "sample_ids": ["tHJ6JSa8Y4", "vBslzh7saPw"], "start_seconds": ["0", "90"], "properties": ["bleeps, electronic, device", "engine, roar, louder"], "captions_pred_video": [null, "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a clock is ticking and beeping", "a jet engine roars and accelerates "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["tOSWIURC-4", "wyllXV6PjKo"], "start_seconds": ["0", "30"], "properties": 
["engine, work, nearby", "a baby, a woman, a man"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a woman speaks and a baby cries"], "question": "which entity is a person", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vXlk0lIQBFo", "xBxDz0CFVn0"], "start_seconds": ["470", "30"], "properties": ["wind, talk, vocalize", "stream, water, flow"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage is blurry and out of focus"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing as people talk and wind blows?", "label": 1}, {"captions": ["material crumbles into a microphone", "a car speeds away loudly followed by a car revving loudly and driving away while outside"], "sample_ids": ["vofpvUo6NAw", "sjlVMgdGSK0"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, microphone", "car, revving, loudly"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes and water drains", "water splashes as an animal walks through"], "sample_ids": ["sfAvvZwdLCY", "w1ir-sZ3Im8"], "start_seconds": ["20", "90"], "properties": ["water drains, flushes, water", "animal, water, splashes"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a toilet is flushed", "water splashes and gurgles as people speak"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a 
stream of water flows quickly", "an audience gives applause"], "sample_ids": ["wbHTKEJZyhc", "x6iCUDmRpKQ"], "start_seconds": ["20", "38"], "properties": ["stream, water, flow", "applause, audience, give"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "a black background with the moon and stars in the sky"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a group of people are clapping and cheering"], "question": "which entity is a human activity", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vcmWSmvti8", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["music, man, fire", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], 
"captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a clicking followed by some people laughing and a kid speaking"], "sample_ids": ["ugHJF0hfYkg", "vz8868znkVQ"], "start_seconds": ["10", "60"], "properties": ["loud, intense, propeller", "audio, click, kid speaking"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a video of a plane flying over a cloudy sky"], "captions_pred_audio": ["a helicopter is flying overhead ", "a baby is laughing and breathing with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a stream of water runs briefly", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["x-PeY8Yb8M4", "yDoT73BWsdA"], "start_seconds": ["300", "10"], "properties": ["stream, water, run", "engine, revs, vehicle"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car is driving on a wet road ", "a race car accelerates and revs its engine "], "question": "which entity is a moving object", "label": 1}, {"captions": ["an engine runs and a man speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["yT5WfYMRr-U", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["engine, run, man", "engine revs, vehicle, people"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["loud clanking and banging with brief male speech", "water 
splashes as an animal walks through"], "sample_ids": ["sWZzXuWYY", "w1ir-sZ3Im8"], "start_seconds": ["420", "90"], "properties": ["male, speech, banging", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "people speak as gunfire rings out"], "sample_ids": ["vdoxuJn9lTc", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["burp, loud, girl", "gunfire, ring, speak"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a infant makes noise and is excited"], "sample_ids": ["u--KhUW8l1Y", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "noise, excited, infant"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["y2bVZ7rz-5M", "tdWhHV3X25Q"], "start_seconds": ["280", "60"], "properties": ["motor noise, horn, siren", "applause, audience, yells"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a man is talking to another man on a stage in front of a microphone"], 
"captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and a crowd is clapping"], "question": "which entity is a response to a performance", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "paper folding and crinkling"], "sample_ids": ["uZesmtKZGSw", "zPpG3RD8lSs"], "start_seconds": ["250", "20"], "properties": ["men, talk, cars", "paper, fold, crinkle"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "the wind blows and a mouse clicks "], 
"question": "which entity is more quiet", "label": 1}, {"captions": ["several insects fly while two men talk", "a horse runs while two women talk"], "sample_ids": ["s-T9OVOiMLo", "sdvI1mHAsc"], "start_seconds": ["330", "20"], "properties": ["several, fly, men", "two women, horse, run"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", null], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "horses clip-clop and a woman speaks"], "question": "which entity is a horse?", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wnpJndXuxLc", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["blows, vehicle, train", "rooster, crow, background, men"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "people cheer as a vehicle engine revs"], "sample_ids": ["zkKdxzNC97Y", "xjhAnI2q6hM"], "start_seconds": ["27", "6"], "properties": ["hard, surface, door", "engine revs, vehicle, people"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a door is opened and closed", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a woman speaks as she rubs two objects together"], "sample_ids": 
["x6ijhqRY38s", "vzxHnu-SFEw"], "start_seconds": ["250", "80"], "properties": ["bowl, silverware, man", "two objects, woman, speak"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is being moved in the bowl", "label": 0}, {"captions": ["a clang followed by a toilet flushing", "water pouring and bubbling"], "sample_ids": ["wNZ5thZM7XU", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["sound, flush, toilet", "water, bubbles, pouring"], "captions_pred_video": ["footage of a toilet in a bathroom stall", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a toilet flushes", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a stream of water flows as people talk and wind blows"], "sample_ids": ["voJh2gJxXhA", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["music, frog, croak", "stream, water, flow"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "footage is blurry and out of focus"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sapQIQUhFc", "uEU-Hg5MTN8"], "start_seconds": ["280", "27"], "properties": ["liquid, flow, distance", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a man talking nearby and another man talking far away?", "label": 0}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["su6FAOcOA8c", "tdWhHV3X25Q"], "start_seconds": ["4", "60"], "properties": ["engine, run, woman", "applause, audience, 
yells"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zj2R0XoFr5k", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["airplane, boy, fly", "gun, shoot, water"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["birds chirp and wind blows", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["sxIvBMSavMQ", "sjlVMgdGSK0"], "start_seconds": ["210", "30"], "properties": ["birds, chirp, wind", "accelerates, vehicle, race car"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to 
extract honey from a", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "an insect buzzes around continuously"], "sample_ids": ["w0xsN8X18Y", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["music, surface, rain", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a man speaks followed by another man speaking outside"], "sample_ids": ["zofjfKhqLk8", "viuTg1M-dqg"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "two men, speak, follow"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "speaking following by laughing and clapping"], "sample_ids": ["slZLHwNbbt4", "u2f5NpsoHBg"], "start_seconds": ["300", "30"], "properties": ["train, horn, sound", "person, laugh, clap"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking and a crowd is clapping"], "question": "which entity is a person", "label": 1}, {"captions": 
["an aircraft engine runs as people speak", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wTjoRj1se3U", "tdWhHV3X25Q"], "start_seconds": ["390", "60"], "properties": ["engine, run, people", "applause, audience, yells"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["frogs croak and vocalize", "a man speaks as a car is passing by"], "sample_ids": ["yswmmRZFItk", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["croak, vocalize, frog", "a, car, pass"], "captions_pred_video": ["a close up of a frog in the water", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a frog is croaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a woman speaks as she rubs two objects together"], "sample_ids": ["weDbePuc-Xc", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["music, slaps, human", "two objects, woman, speak"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "people speak as gunfire rings out"], "sample_ids": ["smDKStoHBJo", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["a, infant, speaking", "gunfire, ring, speak"], "captions_pred_video": ["a man holding a crying baby in his arms", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and a gun is fired"], "question": "which entity is about a woman speaking to an infant?", "label": 0}, {"captions": ["a woman talks while a baby cries and a man whispers", "a man speaks followed by another man speaking outside"], "sample_ids": ["smDKStoHBJo", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["a, talk, baby, cry", "two men, speak, follow"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a baby is crying 
and a woman is speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a baby?", "label": 0}, {"captions": ["a duck quacks loudly and continuously", "a man speaks followed by another man speaking outside"], "sample_ids": ["vh30P49Po6s", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "two men, speak, follow"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is quieter", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a jet engine spools up and takes off"], "sample_ids": ["zNRChLjqcU", "vBslzh7saPw"], "start_seconds": ["220", "90"], "properties": ["water, faucet, run", "engine, spools, takes"], "captions_pred_video": [null, "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["water is running from a faucet into a sink", "a jet engine roars and accelerates "], "question": "which entity is a moving object", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "winds blows roughly as a vehicle races past"], "sample_ids": ["sQGXqGcwOTc", "xjvTpk2Zpr8"], "start_seconds": ["3", "70"], "properties": ["audio, kid, giggles", "wind, blows, vehicle"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a sheep baa followed by birds chirping and then more sheep baaing"], 
"sample_ids": ["sjlVMgdGSK0", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["car, revving, loudly", "sheep, baa, birds"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a goat bleats and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a dog barks and whimpers"], "sample_ids": ["uJV8NDaHqqk", "sShpyu2l4YQ"], "start_seconds": ["100", "0"], "properties": ["loud, fly, chirp", "barks, whimpers, dog"], "captions_pred_video": ["a bee hive in a wooden box", "the puppies are playing with a toy"], "captions_pred_audio": ["a swarm of bees buzzing around", "a dog is barking and growling"], "question": "which entity is quieter", "label": 1}, {"captions": ["a dog barks and whimpers", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sShpyu2l4YQ", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["barks, whimpers, dog", "music, gunfire, explosion"], "captions_pred_video": ["the puppies are playing with a toy", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog is barking and growling", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["a guy speaks with birds chirping in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["v5P-ThUCINM", "xBxDz0CFVn0"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking with wind noise in the background "], "question": "which entity is a natural scene", "label": 1}, {"captions": ["music 
plays and repeated slaps accompany human sniveling, then insect buzz", "a horn honks and then loudly blares"], "sample_ids": ["weDbePuc-Xc", "wnpJndXuxLc"], "start_seconds": ["40", "50"], "properties": ["music, slaps, human", "horn, honk, loud"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a dog barks and whimpers", "someone is typing on a computer keyboard"], "sample_ids": ["sShpyu2l4YQ", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["barks, whimpers, dog", "keyboard, type, computer"], "captions_pred_video": ["the puppies are playing with a toy", "how to make money on youtube in spanish"], "captions_pred_audio": ["a dog is barking and growling", "a person is typing on a keyboard"], "question": "which entity is typing", "label": 1}, {"captions": ["a child speaks in closed space", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yW6FWLSLkx4", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["child, space, speak", "rooster, crow, background, men"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a person is snoring while sleeping"], "sample_ids": ["xV7Mg1QucSc", "vJrjSeP17yE"], "start_seconds": ["14", "40"], "properties": 
["alarm, ticktocks, laughs", "a person is sleeping, snoring, person"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["ukxt9I7eMMg", "w6RTHR6AeAg"], "start_seconds": ["30", "40"], "properties": ["continuous, woman, speaking", "call, owl, screech"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "an owl hoots and mechanisms operate "], "question": "which entity is a bird?", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a baby cries and a woman speaks"], "sample_ids": ["tDlfY3nmx1A", "tMbMDvT50j8"], "start_seconds": ["160", "12"], "properties": ["applause, laugh, man", "a, cry, woman"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a baby cries and a woman speaks"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["dogs barking and whimpering", "water splashes as an animal walks through"], "sample_ids": ["tIY7qOV3rEM", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["barking, whimpering, dog", "animal, water, splashes"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "water splashes and gurgles as people 
speak"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and wind blows", "a girl talking, laughing and sneezing noise"], "sample_ids": ["sxIvBMSavMQ", "y4tPJXBKDig"], "start_seconds": ["210", "20"], "properties": ["birds, chirp, wind", "a, noise, talk"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage of the woman wiping her nose with a tissue"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking and coughing with background noise and breathing "], "question": "which entity is talking", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tGcFnX0GHI", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["ring, talk, woman", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["someone whistles briefly", "small dogs yip and bark sharply"], "sample_ids": ["uFoga8sHpiw", "v-wcQf4BDY0"], "start_seconds": ["90", "120"], "properties": 
["sound, duration, pitch", "bark, yip, sharply"], "captions_pred_video": ["footage of a bird in a cage", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a person whistles a song", "a dog barks and growls"], "question": "which entity is louder", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a telephone rings followed by a woman talking"], "sample_ids": ["vs65y4qmyBE", "tGcFnX0GHI"], "start_seconds": ["340", "0"], "properties": ["engine, run, man", "ring, talk, woman"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "a car revs and accelerates loudly and men and women chatter among themselves"], "sample_ids": ["vs65y4qmyBE", "y8dSeubCNI"], "start_seconds": ["340", "4"], "properties": ["wind, blows, strongly", "men, women, car"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "an engine revving and people talking in the background"], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["ugHJF0hfYkg", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["loud, propeller, move", "water, radio, man"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not moving", "label": 1}, {"captions": ["a race car 
approaches quickly and slows down squealing tires", "winds blows roughly as a vehicle races past"], "sample_ids": ["sEprKHm8Sj8", "xjvTpk2Zpr8"], "start_seconds": ["90", "70"], "properties": ["car, tires, slows", "wind, blows, vehicle"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a jet engine roars and wind blows "], "question": "which vehicle is moving faster", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "motors rev and run loudly as a person laughs"], "sample_ids": ["uiItxDsDMFI", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["wood, piece, saw", "motors rev, laugh, loudly"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a saw is being used with background noise ", "a jet engine roars "], "question": "which entity is not a person?", "label": 0}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a vehicle is skidding and squealing tires"], "sample_ids": ["y2bVZ7rz-5M", "soTOh3zYJfY"], "start_seconds": ["280", "40"], "properties": ["motor noise, horn, siren", "vehicle, skid, tires"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "birds chirp and objects are moved around"], 
"sample_ids": ["s4Uz1Ffgo04", "yPUYU6t3rwo"], "start_seconds": ["100", "370"], "properties": ["water, rushes, motorcycle", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "insects buzz and a man speaks"], "question": "which entity is more calm", "label": 1}, {"captions": ["water is sprayed across a hard surface", "several insects fly while two men talk"], "sample_ids": ["sQwlkXjQabo", "s-T9OVOiMLo"], "start_seconds": ["10", "330"], "properties": ["water, spray, surface", "several, fly, men"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of a natural event", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a woman speaks happily and an animal chirps"], "sample_ids": ["y8WEcpOlT3I", "uWAAAL4CIoc"], "start_seconds": ["40", "0"], "properties": ["wind, speak, buffeting", "a woman, chirps, animal"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be in a zoo", "label": 1}, {"captions": ["a child speaks in closed space", "someone snores nearby"], "sample_ids": ["yW6FWLSLkx4", "spJCm8tD9Zo"], "start_seconds": ["40", "90"], "properties": ["child, space, speak", "someone snores, nearby, someone"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a man laying on the ground with his mouth open"], 
"captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "vehicles pass by on a roadway"], "sample_ids": ["zdYdyF9-m8U", "tgbONvsP47Y"], "start_seconds": ["7", "0"], "properties": ["wind, crash, shoreline", "pass, vehicle, roadway"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of a fire truck entering a garage"], "captions_pred_audio": ["waves crash and wind blows ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "a duck quacks continuously"], "sample_ids": ["zj2R0XoFr5k", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, overhead", "quacks, continuously, duck"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["xOZfdgAgJ9o", "x9JovgqUcs"], "start_seconds": ["40", "500"], "properties": ["woman, whimpering, speaking", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "several insects fly while two men talk"], "sample_ids": ["w5W5Kqtc8E", "s-T9OVOiMLo"], 
"start_seconds": ["100", "330"], "properties": ["wind, blow, vehicle", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about flying insects?", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a person speaks briefly"], "sample_ids": ["zTLVJCo4WEE", "zOZleIRqZm4"], "start_seconds": ["30", "80"], "properties": ["a, crickets, sing", "person, talk, brief"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking with crickets chirping in the background"], "question": "which entity is a person talking?", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "a vehicle engine accelerating then running on idle"], "sample_ids": ["s4Uz1Ffgo04", "vYkA3cfXp5Q"], "start_seconds": ["100", "30"], "properties": ["water, rushes, vehicle", "engine, accelerate, idle"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "an engine is idling"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["dogs barking and whimpering", "someone whistles a tune"], "sample_ids": ["tIY7qOV3rEM", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["barking, whimpering, dog", "someone, tune, whistle"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": 
["a person whistles and clicks a mouse", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zCrAfDfv6-A", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["person, mouse, click", "applause, audience, yells"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a person whistles a song", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["y2ZBGpgbhHM", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["animal, growl, bird", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "water flows and trickles"], "sample_ids": ["vf44CgrjT0A", "tB7hWb9gTuQ"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "water, flow, trickle"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a loud burp", "water is splashing and gurgling"], "question": "which entity is flowing", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a man speaks as a car is passing by"], "sample_ids": ["vJvryTwuAV8", "sK4u5T8hW78"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "a, car, pass"], "captions_pred_video": ["a crowd of people are gathered in a mall, 
watching a man take a selfie in front of the crowd", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man speaking to an audience?", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "a man speaks over intermittent keyboard taps"], "sample_ids": ["xKB8O8LTs6s", "tw76HGONaKg"], "start_seconds": ["70", "570"], "properties": ["music, gunshots, explosion", "audio, man, keyboard"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man speaks and types on a 
computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a machine beeps continuously"], "sample_ids": ["wqUmIEzuNz4", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["frog, bird, vocalize", "beeps, machine, continuously"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", null], "captions_pred_audio": ["a cat meows and rustles", "a beeping sound is being made "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vfYTJq7nU", "wqZ135Ssz0"], "start_seconds": ["130", "60"], "properties": ["ducks, quack, man", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["a toilet flushes and water drains", "a person sneezes followed by another person speaking"], "sample_ids": ["sfAvvZwdLCY", "t8CV69hcvF0"], "start_seconds": ["20", "210"], "properties": ["water drains, flushes, water", "person, sneeze, follow"], "captions_pred_video": ["footage of the toilet in the bathroom", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a toilet is flushed", "a woman sneezes and speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a machine beeps continuously"], "sample_ids": ["ugHJF0hfYkg", "y682ml90jGw"], "start_seconds": ["10", "11"], "properties": ["loud, intense, propeller", "beeps, machine, continuously"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a beeping sound is being made "], 
"question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["sG7TyPnFDR0", "wyllXV6PjKo"], "start_seconds": ["180", "30"], "properties": ["beeps, machine, smoke alarm", "a baby, a woman, a man"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a woman speaks and a baby cries"], "question": "which entity has a baby?", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wP8ZKrlx3oA", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["rain, storm, thunder", "female, spraying, scream"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a dog barks and whimpers", "a drill drills through something then people begin laughing"], "sample_ids": ["sShpyu2l4YQ", "tEE3MpBt1sg"], "start_seconds": ["0", "50"], "properties": ["barks, whimpers, dog", "drill, something, laugh"], "captions_pred_video": ["the puppies are playing with a toy", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a dog is barking and growling", "people are laughing breathing and speaking with background noise "], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["distant humming of an engine", "an engine runs loudly"], "sample_ids": ["yVPZ2MNWpms", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["sound, distance, engine", 
"loud, engine, run"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a car is driving by on the road ", "a lawn mower is running and men are speaking "], "question": "which engine is louder", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tGcFnX0GHI", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["ring, talk, woman", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["an engine starts and increases in power", "a stream of water runs briefly"], "sample_ids": ["zjTG0gaGCUI", "x-PeY8Yb8M4"], "start_seconds": ["80", "300"], "properties": ["power, increase, engine", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wztCSUxOf8", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["a crowd, yells, applauds", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be at a sporting event", "label": 0}, {"captions": ["a man speaks while water drains", "people converse as a motor runs and air brakes hiss"], "sample_ids": 
["vSeGhaZt-aI", "zFjIWfSD-4"], "start_seconds": ["50", "410"], "properties": ["water, drain, man", "People, motor, brakes"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "vehicles pass by on a roadway"], "sample_ids": ["vJ7JPEFhyLA", "tgbONvsP47Y"], "start_seconds": ["16", "0"], "properties": ["three men, wind, flow", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["ticking continues without interruption", "water flows as men speak and yell"], "sample_ids": ["v-g-j2uTByM", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["ticking, continuous, clock", "water, flow, men"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not continuous", "label": 1}, {"captions": ["a car accelerates and wind blows", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["u0TrcHhkPQ", "wDVMhEdTiVw"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a gun is fired followed by splashing 
and a person sneezing"], "question": "which entity is more likely to be in a car", "label": 0}, {"captions": ["the revving of an engine throttle followed by a man speaking", "vehicles pass by on a roadway"], "sample_ids": ["tezvROoo4bs", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["audio, throttle, speaking", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a car is driving on the road "], "question": "which entity is a video", "label": 1}, {"captions": ["a door opens and closes", "paper is crumpling consistently"], "sample_ids": ["vBHyYJ8pL0", "v5cSxLaHADY"], "start_seconds": ["2", "0"], "properties": ["open, close, door", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a person speaks briefly"], "sample_ids": ["xvDdE3zNf8Y", "zOZleIRqZm4"], "start_seconds": ["120", "80"], "properties": ["a, female, speaks", "person, talk, brief"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking with crickets chirping in the background"], "question": "which entity is talking", "label": 1}, {"captions": ["water flows as men speak and yell", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["vJ7JPEFhyLA", "ziUT9IFTkjg"], "start_seconds": ["16", "10"], "properties": ["water, flow, men", "background, birds, rustling"], 
"captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a toilet flushes and a female speaks"], "sample_ids": ["vcmWSmvti8", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["music, man, fire", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a toilet flushes and a man speaks"], "question": "which entity is about a toilet?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a telephone rings followed by a woman talking"], "sample_ids": ["s6DESzUTGjY", "tGcFnX0GHI"], "start_seconds": ["16", "0"], "properties": ["wind, laugh, woman", "ring, talk, woman"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a dial tone sounds followed by a woman speaking"], 
"question": "which woman is talking", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vfYTJq7nU", "uZesmtKZGSw"], "start_seconds": ["130", "250"], "properties": ["ducks, quack, man", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an animal quacks rapidly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vh30P49Po6s", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["animal, quacks, rapidly", "airplane, boy, fly"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a duck is quacking loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["slZLHwNbbt4", "y2bVZ7rz-5M"], "start_seconds": ["300", "280"], "properties": ["train, horn, sound", "motor noise, horn, siren"], "captions_pred_video": ["footage of 
a train coming down the tracks on a sunny day", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning device", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "people speak as gunfire rings out"], "sample_ids": ["xO-Q2BlIIPU", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["two men, exclamation, speak", "gunfire, ring, speak"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a gun is fired"], "question": "which entity shows a man speaking to another man?", "label": 0}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a man speaks as water trickles down a stream"], "sample_ids": ["x9JovgqUcs", "sapQIQUhFc"], "start_seconds": ["500", "280"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "water, stream, trickles"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is speaking and a stream is flowing in the background "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sEprKHm8Sj8", "yswmmRZFItk"], "start_seconds": ["90", "0"], "properties": ["car, tires, slows", "background, frog, croak"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 
2054 2055 2056 2057 2058 2059 2060 2", "a close up of a frog in the water"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vXlk0lIQBFo", "zl9Dqx-j7q4"], "start_seconds": ["470", "6"], "properties": ["wind, speak, vocalize", "engine, laugh, loud"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a man driving a car in the dark"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "winds blows roughly as a vehicle races past"], "sample_ids": ["tiDFTC-5vU", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["male, duck, laugh", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "water running down a sink while a man is talking"], "sample_ids": ["tjmoSi330GM", "vSeGhaZt-aI"], "start_seconds": ["23", "50"], "properties": ["speed, water, boat", "water, sink, talk"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is stationary", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "people speak 
as gunfire rings out"], "sample_ids": ["x5cuQjOdM3E", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["cat, talk, meow", "gunfire, ring, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a small engine spits as it runs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sZvwOuuPGP0", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["spits, engine, runs", "a woman, something, fried"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a medium engine is running ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["y8WEcpOlT3I", "uWAAAL4CIoc"], "start_seconds": ["40", "0"], "properties": ["harsh, wind, blows", "a woman, chirps, animal"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is more cheerful", "label": 1}, {"captions": ["birds coo incessantly", "water is sprayed across a hard surface"], "sample_ids": ["yZrFNS7GFBQ", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["coo, bird, incessant", "water, spray, surface"], "captions_pred_video": ["of the bird in the cage", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["an owl hoots in the background ", "spraying followed by silence"], "question": "which entity 
is a liquid", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "someone is typing on a computer keyboard"], "sample_ids": ["xvDdE3zNf8Y", "v0x1odnXtP0"], "start_seconds": ["120", "210"], "properties": ["a, female, speaks", "keyboard, type, computer"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman speaks and crumples paper", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "a man speaking with light rustling"], "sample_ids": ["vZAw4apG0Es", "zOZleIRqZm4"], "start_seconds": ["30", "80"], "properties": ["people, clock, converse", "light, rustling, man"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking with crickets chirping in the background"], "question": "which entity is a man speaking with light rustling?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["w5W5Kqtc8E", "w5W5Kqtc8E"], "start_seconds": ["100", "100"], "properties": ["water, splashes, motorboat", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person speaks briefly", "tapping occurs then a baby cries"], "sample_ids": ["zOZleIRqZm4", "wIJK3-5y0kA"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "a, cry, baby"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a baby playing with a cat 
in a dark room"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby cries and a woman speaks"], "question": "which entity is a person talking?", "label": 0}, {"captions": ["loud clanking and banging with brief male speech", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sWZzXuWYY", "xfaoyyzw2WU"], "start_seconds": ["420", "180"], "properties": ["male, speech, banging", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uWPRNLnpy7Y", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "multiple, people, yell"], "captions_pred_video": ["is taken from a car driving down the street", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a duck quacks continuously"], "sample_ids": ["sjlVMgdGSK0", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["car, revving, loudly", "quacks, continuously, duck"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "water is sprayed across a hard surface"], "sample_ids": ["xyx6eNVEYRY", "sQwlkXjQabo"], "start_seconds": ["380", "10"], 
"properties": ["loud, engine, muffles", "water, spray, surface"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a race car approaches quickly and slows down squealing tires"], "sample_ids": ["yZrFNS7GFBQ", "sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["pigeon, buzzes, insect", "car, tires, slows"], "captions_pred_video": ["of the bird in the cage", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["an owl hoots in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "dishes cling together then a man begins to speak"], "sample_ids": ["s4Uz1Ffgo04", "sQGXqGcwOTc"], "start_seconds": ["100", "3"], "properties": ["water, rushes, motorcycle", "cling, speak, dishes"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a person snores loudly multiple times at a close distance"], "sample_ids": ["weDbePuc-Xc", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": 
["cartoon character, music, vocalize", "loud, multiple, distance"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a horn rings out as a machine runs by"], "sample_ids": ["wqADXCzngMw", "slZLHwNbbt4"], "start_seconds": ["340", "300"], "properties": ["engine, idle, man", "a, horn, run"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a goat screams and people speak in the background", "wind blowing followed by a zoom"], "sample_ids": ["xC8kbrKJmco", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["background, goat, scream", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a goat is bleating ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a man speaks followed by another man speaking outside"], "sample_ids": ["yeFvk9x0wWI", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["chirp, twitter, clatter", "two men, speak, follow"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a human 
speaking?", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "wind blows as people chatter quietly"], "sample_ids": ["vlS6YMeWAPo", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["noise, bleat, call", "wind, chatter, people"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage is blurry and out of focus"], "captions_pred_audio": ["a goat bleats and birds chirp", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a car speeds away loudly followed by a car revving loudly and driving away while outside"], "sample_ids": ["uWPRNLnpy7Y", "sjlVMgdGSK0"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "car, revving, loudly"], "captions_pred_video": ["is taken from a car driving down the street", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a car accelerates and revs its engine "], "question": "which vehicle is revving loudly", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a propeller moves loudly nearby"], "sample_ids": ["sapQIQUhFc", "ugHJF0hfYkg"], "start_seconds": ["280", "10"], "properties": ["liquid, flow, distance", "loud, propeller, move"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a helicopter is flying overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["w2bYrCVLT60", "zl9Dqx-j7q4"], "start_seconds": ["120", "6"], "properties": ["ducks, speak, quack", "engine, laugh, loud"], "captions_pred_video": ["of the ducks drinking 
from a pink pool in the grass", "footage of a man driving a car in the dark"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["uYT5gxnyMWM", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["a, scream, girl", "animal, grunts, snorts"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking followed by a scream?", "label": 0}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vqZuVbG6-HI", "tdWhHV3X25Q"], "start_seconds": ["130", "60"], "properties": ["background, male, female", "applause, audience, yells"], "captions_pred_video": ["footage is blurry because it's raining outside", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["su6FAOcOA8c", "yDoT73BWsdA"], "start_seconds": ["4", "10"], "properties": ["engine, idle, woman", "engine, revs, vehicle"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a race car accelerates and revs 
its engine "], "question": "which engine is revving", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["wnpJndXuxLc", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["blows, vehicle, train", "gun, shoot, water"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["wind blowing followed by a zoom", "roadway noise occurs and a truck accelerates"], "sample_ids": ["vr8ZXjEBhMQ", "tgbONvsP47Y"], "start_seconds": ["150", "0"], "properties": ["wind, blow, zoom", "noise, truck, accelerate"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage of a fire truck entering a garage"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a car is driving on the road "], "question": "which is not a zoom", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "water is sprayed across a hard surface"], "sample_ids": ["un9VQlzgZM", "sQwlkXjQabo"], "start_seconds": ["5", "10"], "properties": ["females, talk, laugh", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "wind blows as people chatter quietly"], "sample_ids": ["xvDdE3zNf8Y", "xBxDz0CFVn0"], "start_seconds": ["120", "30"], "properties": ["a, female, speaks", "wind, chatter, people"], 
"captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a stream of water runs briefly"], "sample_ids": ["t25U-v4k4ts", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["a, chirps, bird", "stream, water, run"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "water flows as men speak and yell"], "sample_ids": ["v-wcQf4BDY0", "vJ7JPEFhyLA"], "start_seconds": ["120", "16"], "properties": ["bark, yip, sharply", "water, flow, men"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["t69a8aRKhmc", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "rooster, crow, background, men"], "captions_pred_video": ["footage is blurry and out of focus", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people speaking", 
"label": 1}, {"captions": ["a man rubs two objects together then speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vveS8HT7Uog", "w34HjHr6gAY"], "start_seconds": ["100", "30"], "properties": ["a man, objects, speak", "beeps, hit, woman"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a toilet flushes and a female speaks"], "sample_ids": ["w2JXXIAdUdg", "yaln9y8I7ms"], "start_seconds": ["10", "230"], "properties": ["snoring, distance, person", "female, flushes, toilet"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage is blurry and out of focus"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a toilet flushes and a man speaks"], "question": "which entity is a toilet?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a stream of water runs briefly"], "sample_ids": ["vimzuGQvdcU", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["a, man, yells", "stream, water, run"], "captions_pred_video": ["a group of people are rafting down a river", "a man sitting on a rock in the 
middle of a river"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["s4Uz1Ffgo04", "zFjIWfSD-4"], "start_seconds": ["100", "410"], "properties": ["water, rushes, vehicle", "People, motor, brakes"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a vehicle zooming past?", "label": 0}, {"captions": ["a wooden clack accompanies nearby chirping birds", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yeFvk9x0wWI", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["clack, bird, chirp", "a woman, a television program, a bird"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking and a dog is whimpering"], "question": "which entity has more birds", "label": 1}, {"captions": ["a clock ticktocks briefly", "wind blows as people chatter quietly"], "sample_ids": ["u7C-AEBQM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks briefly", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a vehicle engine revs as 
the vehicle passes"], "sample_ids": ["yajyRTUQk3U", "yDoT73BWsdA"], "start_seconds": ["400", "10"], "properties": ["a woman, something, fried", "engine, revs, vehicle"], "captions_pred_video": ["- a woman cooking in the kitchen", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an electronic device bleeps once", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tHJ6JSa8Y4", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["bleeps, electronic, device", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a clock is ticking and beeping", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a horn honks followed by a loud continuous buzzing while men speak"], "sample_ids": ["spYNpeN7rPY", "wsHBIgzs9Fs"], "start_seconds": ["1", "50"], "properties": ["a clock, ticktock, man", "horn, continuous, buzzing"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "shows a motorcycle riding down a country road with a motorcycle in the foreground"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a car accelerates and revs its engine while a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a jet engine spools up and takes off", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vBslzh7saPw", "vbZ-0lGPneg"], "start_seconds": ["90", "30"], "properties": ["engine, spools, takes", "a woman, a television program, a bird"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a moving object", "label": 0}, {"captions": ["a person is whistling", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sIXTftIuUgw", "tdWhHV3X25Q"], "start_seconds": ["90", 
"60"], "properties": ["person, whistling, person", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and a crowd is clapping"], "question": "which person is more likely to be a performer", "label": 1}, {"captions": ["an electronic device bleeps once", "a clock ticktocks"], "sample_ids": ["tHJ6JSa8Y4", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["bleeps, electronic, device", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking and beeping", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["a toilet flushes and water drains", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sfAvvZwdLCY", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["water drains, flushes, water", "rustling, ducks, quack"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["popping and crackling repeats as men yell and laugh", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["rqu8iB22IY", "tdWhHV3X25Q"], "start_seconds": ["5", "60"], "properties": ["sound, repeats, laugh", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["rustling with distant murmuring", "some men talk among st themselves as 
cars speed and race loudly"], "sample_ids": ["wnNNcxAPwGQ", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["sound, distance, rustling", "men, talk, cars"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks as water trickles down a stream", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sapQIQUhFc", "xKB8O8LTs6s"], "start_seconds": ["280", "70"], "properties": ["water, stream, trickles", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["heavy rain splashes as it falls", "water splashes as an animal walks through"], "sample_ids": ["wP8ZKrlx3oA", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["fall, rain, splash", "animal, water, splashes"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a group of people 
riding horses through a river"], "captions_pred_audio": ["a heavy rain is falling on a surface", "water splashes and gurgles as people speak"], "question": "which entity is more likely to cause water to splash", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "water pouring and bubbling"], "sample_ids": ["w5W5Kqtc8E", "uyRfq-jKPpo"], "start_seconds": ["100", "50"], "properties": ["wind, engine, scream", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "water pouring and bubbling"], "sample_ids": ["sDSppXIlJrs", "uyRfq-jKPpo"], "start_seconds": ["27", "50"], "properties": ["microphone, water, wind", "water, bubbles, pouring"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to 
do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["the wind is blowing and water is splashing", "water is running from a faucet"], "question": "which entity is more likely to be found in a bathroom", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a clock ticktocks"], "sample_ids": ["zdYdyF9-m8U", "v-g-j2uTByM"], "start_seconds": ["7", "30"], "properties": ["wind, crash, shoreline", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["waves crash and wind blows ", "a clock is ticking loudly"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["a man speaks as crickets sing", "birds chirp and pigeons vocalize while walking around"], "sample_ids": ["ryFDPxgDOGc", "wIvYjuR3nrg"], "start_seconds": ["570", "9"], "properties": ["a, crickets, sing", "birds, pigeons, vocalize"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "footage of a pigeon sitting on a roof with trees in the background"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "birds are chirping and cooing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a toilet flushes and water drains", "rain falls on a surface as men speak and music plays"], "sample_ids": ["sfAvvZwdLCY", "w0xsN8X18Y"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "music, surface, 
rain"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is a video of a toilet flushing and water draining?", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "people speak as gunfire rings out"], "sample_ids": ["tK4VlLsNxak", "wqTCwqVRDlk"], "start_seconds": ["120", "80"], "properties": ["a, dial, telephone", "gunfire, ring, speak"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a car speeding up in the distance"], "sample_ids": ["xjhAnI2q6hM", "u0TrcHhkPQ"], "start_seconds": ["6", "20"], "properties": ["wind, blow, loudly", "distance, car, speed"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "an engine starts and increases in power"], "sample_ids": ["wP8ZKrlx3oA", "zjTG0gaGCUI"], "start_seconds": ["40", "80"], "properties": ["rain, storm, thunder", "power, increase, engine"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a jet engine roars as wind blows "], "question": "which entity is more powerful", "label": 
1}, {"captions": ["a woman speaks happily and an animal chirps", "a telephone rings followed by a woman talking"], "sample_ids": ["uWAAAL4CIoc", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["a woman, chirps, animal", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a motorcycle engine is idling", "a car speeding up in the distance"], "sample_ids": ["vZAqdHZ81yA", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["engine, motorcycle, idling", "distance, car, speed"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a race car accelerates and revs its engine "], "question": "which object is moving faster", "label": 0}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sxYkFKFIZD0", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["screech, man, door", "airplane, boy, fly"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": 
["multiple birds chirp and an animal grunts", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["tDlysoZiA1I", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["animal, grunt, multiple", "animal, grunts, snorts"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal grunts and snorts?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "an insect buzzes around continuously"], "sample_ids": ["yJ0TePmaOo", "v25l1jef3JY"], "start_seconds": ["390", "0"], "properties": ["two hard objects, man, speak", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a baby laugh at a sputter", "a train horn blows as it passes by"], "sample_ids": ["sLUnaPT5gM8", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["laugh, sputter, baby", "horn, blows, train"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["water pouring and bubbling", "wind blowing followed by a zoom"], "sample_ids": ["uyRfq-jKPpo", "vr8ZXjEBhMQ"], "start_seconds": ["50", "150"], "properties": ["water, bubbles, pouring", "wind, blow, zoom"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["water is running from a faucet", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a natural phenomenon", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sQGXqGcwOTc", "xBxDz0CFVn0"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "stream, water, flow"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage is blurry and out of focus"], 
"captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "some men converse over an engine running"], "sample_ids": ["wAAkbZToh8", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["burp, laugh, speak", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man burps and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ukxt9I7eMMg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["continuous, woman, speaking", "female, spraying, scream"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking towards the end?", "label": 0}, {"captions": ["a bird is chirping and tweeting a bird song", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["wPz6QRAkEb4", "tDlysoZiA1I"], "start_seconds": ["60", "0"], "properties": ["chirps, tweets, song", "animal, grunts, chirps"], "captions_pred_video": ["a bird in a cage on top of a pole", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping in the background ", "birds are chirping and a rooster is crowing "], "question": "which entity is a bird", "label": 0}, {"captions": ["a man speaks as horns blow", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["tHyNqRyK34A", "y8WEcpOlT3I"], "start_seconds": 
["24", "40"], "properties": ["a, man, speaks", "harsh, wind, blows"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity has a man speaking as horns blow?", "label": 0}, {"captions": ["a clock alarm sounds and gears turn", "a toilet flushes and a female speaks"], "sample_ids": ["w2M4i1mklOA", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["alarm, gears, turn", "female, flushes, toilet"], "captions_pred_video": ["footage of an antique clock", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "water is sprayed across a hard surface"], "sample_ids": ["sSMl2vc3ek", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["loud, multiple, distance", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a person snoring loudly", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["children speak as a female ask them questions", "a woman speaks as she rubs two objects together"], "sample_ids": ["wEBlkGWVWwE", "vzxHnu-SFEw"], "start_seconds": ["260", "80"], "properties": ["female, speak, questions", "two objects, woman, speak"], "captions_pred_video": ["shows a person writing on the whiteboard", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a horn blasts loudly as a train passes"], "sample_ids": ["vs65y4qmyBE", "zsLxS-uLJTw"], "start_seconds": ["340", "20"], "properties": ["wind, blows, strongly", "horn, blast, train"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage of the train on the tracks at sunrise or sunset"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a train blows its horn and moves on the tracks "], "question": "which is louder", "label": 0}, {"captions": ["continuous sneezing together with speech", "water flows and trickles"], "sample_ids": ["x4dZyf9Gbj0", "tB7hWb9gTuQ"], "start_seconds": ["130", "30"], "properties": ["continuous, sneeze, speech", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and out of focus", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman sneezes and speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a 
beep repeats multiple times", "a bird is chirping and tweeting a bird song"], "sample_ids": ["y682ml90jGw", "wPz6QRAkEb4"], "start_seconds": ["11", "60"], "properties": ["beep, repeat, multiple", "chirps, tweets, song"], "captions_pred_video": [null, "a bird in a cage on top of a pole"], "captions_pred_audio": ["a beeping sound is being made ", "birds are chirping in the background "], "question": "which entity is a song", "label": 1}, {"captions": ["a man speaks as horns blow", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tHyNqRyK34A", "vb1fPSDI4c"], "start_seconds": ["24", "30"], "properties": ["a, man, speaks", "multiple, people, yell"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["paper is crumpling consistently", "an engine idles consistently before sputtering some"], "sample_ids": ["v5cSxLaHADY", "rwTERCUno"], "start_seconds": ["0", "90"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "engine, idle, sputter"], "captions_pred_video": ["footage of the person holding a pair of scissors", null], "captions_pred_audio": ["paper is crumpled and crinkled", "an engine is idling and vibrating"], "question": "which entity is consistent", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["y2ZBGpgbhHM", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["animal, growl, bird", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking and a subway train is moving "], "question": "which entity is 
stationary", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a toilet flushes and water drains"], "sample_ids": ["wnpJndXuxLc", "sfAvvZwdLCY"], "start_seconds": ["50", "20"], "properties": ["beeps, loud, whistle", "water drains, flushes, water"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a person snores loudly multiple times at a close distance"], "sample_ids": ["u2f5NpsoHBg", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["person, laugh, clap", "loud, multiple, distance"], "captions_pred_video": ["is being projected on a screen at the front of the stage", null], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a person snoring loudly"], "question": "which person is speaking", "label": 0}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "winds blows roughly as a vehicle races past"], "sample_ids": ["wy1eKjR7KC0", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["people, talk, distance", "wind, blows, vehicle"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["an emergency siren wails as it passes", "an insect buzzes around continuously"], "sample_ids": ["vGj1XLJvNrw", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["wails, wails, pass", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a 
police car driving down a city street", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a fly is buzzing around a microphone "], "question": "which entity buzzes around continuously", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vddP56-ogds", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["water, flow, laugh", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "a consistent ticking pattern"], "sample_ids": ["uWAAAL4CIoc", "sCeWURVHfOM"], "start_seconds": ["0", "30"], "properties": ["a, dog, vocalize", "ticking, pattern, clock"], "captions_pred_video": [null, "- a close-up view of the clock's inner workings"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "ticking of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp as a train approaches", "a car accelerates and wind blows"], "sample_ids": ["xM4joTqDVp4", "u0TrcHhkPQ"], "start_seconds": ["160", "20"], "properties": ["bird, chirp, train", "accelerates, wind, blows"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a horse runs while two women talk", "wind blows as people chatter quietly"], "sample_ids": ["sdvI1mHAsc", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["two 
women, horse, run", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uYT5gxnyMWM", "zFjIWfSD-4"], "start_seconds": ["50", "410"], "properties": ["a, scream, girl", "People, motor, brakes"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a girl speaking followed by a scream and more girls talking?", "label": 0}, {"captions": ["an engine runs loudly", "sirens ring and approach with humming of distant traffic"], "sample_ids": ["vqZuVbG6-HI", "xERFUeZONz8"], "start_seconds": ["130", "0"], "properties": ["loud, engine, run", "ring, approach, traffic"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage is blurry due to camera shake or motion blur"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "an emergency vehicle siren blares"], "question": "which entity is a warning", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "someone is typing on a computer keyboard"], "sample_ids": ["y2ZBGpgbhHM", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["birds, tweet, pant", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["birds chirping and a dog panting", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface 
before running tap water", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wvKpEYswXO0", "zl9Dqx-j7q4"], "start_seconds": ["150", "6"], "properties": ["water, tap, run", "engine, laugh, loud"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wvKpEYswXO0", "su6FAOcOA8c"], "start_seconds": ["150", "4"], "properties": ["sound, water, running", "engine, idle, woman"], "captions_pred_video": ["of the person preparing food in the kitchen", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking softly?", "label": 0}, {"captions": ["an airplane accelerates briefly", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["zjTG0gaGCUI", "tIY7qOV3rEM"], "start_seconds": ["80", "0"], "properties": ["accelerates, airplane, briefly", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": [null, "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a dog is barking and a cat is meowing"], "question": "which animal is barking", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a toilet flushes and a female speaks"], "sample_ids": ["wtDqrBygTcU", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["man, engine, run", "female, flushes, toilet"], "captions_pred_video": ["shows a person riding on the 
back of a boat as it speeds through the water", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a motor is running", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a person sniffles and then sneezes in the distance", "a clock ticktocks"], "sample_ids": ["uRlbY6aoBU", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["a, distance, sneeze", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is sneezing ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a man speaks, then dials a rotary telephone"], "sample_ids": ["vJvryTwuAV8", "tK4VlLsNxak"], "start_seconds": ["16", "120"], "properties": ["audience, cheer, man", "a, dial, telephone"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking and using a sewing machine"], "question": "which man is speaking?", "label": 0}, {"captions": ["some clanking with distant murmuring", "pigeons vocalize and birds chirp"], "sample_ids": ["uMTTDZ2mb4", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["clanking, murmuring, distant", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an engine starts and increases in power", "paper is crumpling consistently"], "sample_ids": ["zjTG0gaGCUI", "v5cSxLaHADY"], 
"start_seconds": ["80", "0"], "properties": ["power, increase, engine", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a jet engine roars as wind blows ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a steam engine runs and whistles as it passes by"], "sample_ids": ["yZmhM1HcsyE", "se87d6yxEOA"], "start_seconds": ["4", "10"], "properties": ["engine, roar, water", "run, whistle, pass"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a train is moving and blowing its whistle "], "question": "which entity is a steam engine?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "waves crash against a shoreline and people speak"], "sample_ids": ["sxYkFKFIZD0", "yFB25fqfU8I"], "start_seconds": ["20", "300"], "properties": ["screech, man, door", "wave, crash, shoreline"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": 
["a goat screams and people speak in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xC8kbrKJmco", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["background, goat, scream", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a goat is bleating ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a helicopter engine runs continuously", "an airplane engine runs"], "sample_ids": ["ugHJF0hfYkg", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "engine, airplane, runs"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a duck quacks continuously"], "sample_ids": ["uiItxDsDMFI", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "quacks, continuously, duck"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a saw is being used with background noise ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a toilet flushes and water drains"], "sample_ids": ["y2ZBGpgbhHM", "sfAvvZwdLCY"], "start_seconds": ["30", "20"], "properties": ["animal, growl, bird", "water drains, flushes, water"], "captions_pred_video": [null, "footage of the toilet in the bathroom"], "captions_pred_audio": ["birds chirping and a dog panting", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["women speak as 
water runs briefly, children call out, and a man speaks", "loud clanking and banging with brief male speech"], "sample_ids": ["uRExseg-0XI", "sWZzXuWYY"], "start_seconds": ["210", "420"], "properties": ["woman, man, water", "male, speech, banging"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", null], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a sewing machine runs and a man speaks"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["people converse as a motor runs and air brakes hiss", "a woman speaks happily and an animal chirps"], "sample_ids": ["zFjIWfSD-4", "uWAAAL4CIoc"], "start_seconds": ["410", "0"], "properties": ["People, motor, brakes", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["skd2PphS6oI", "tdWhHV3X25Q"], "start_seconds": ["190", "60"], "properties": ["ring, bird, vocalize", "applause, audience, yells"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "some men converse over an engine running"], "sample_ids": ["sEprKHm8Sj8", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["car, tires, slows", "men, converse, engine"], "captions_pred_video": ["rally 2012 2013 
2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is stationary", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "an infant crying as a woman laughs"], "sample_ids": ["u--KhUW8l1Y", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["engine, sound, horn", "a, laugh, infant"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["bees buzz and wind blows", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tMJne1a4AFI", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["bees buzz, wind blows, bees", "two men, woman, birds"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a video of a natural event?", "label": 0}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "wind blowing followed by a zoom"], "sample_ids": ["vfYTJq7nU", "vr8ZXjEBhMQ"], "start_seconds": ["130", "150"], "properties": ["ducks, quack, man", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a duck quacks and a woman speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a man rubs two objects 
together then speaks", "small dogs yip and bark sharply"], "sample_ids": ["vveS8HT7Uog", "v-wcQf4BDY0"], "start_seconds": ["100", "120"], "properties": ["a man, objects, speak", "bark, yip, sharply"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["wvKpEYswXO0", "vKrYfzleLB8"], "start_seconds": ["150", "110"], "properties": ["water, tap, run", "a, ring, gunshots"], "captions_pred_video": ["of the person preparing food in the kitchen", "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "people applaud and hoot and chat quietly"], "sample_ids": ["w6RTHR6AeAg", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["call, owl, screech", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak as gunfire rings out", "a man speaks as a motor runs in the background"], "sample_ids": ["wqTCwqVRDlk", "xZepNM9qcRA"], "start_seconds": ["80", "30"], "properties": ["gunfire, ring, speak", "background, motor, run"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "a 
close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs barking and whimpering", "people speak as gunfire rings out"], "sample_ids": ["tIY7qOV3rEM", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["barking, whimpering, dog", "gunfire, ring, speak"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a person speaks briefly"], "sample_ids": ["vfYTJq7nU", "zOZleIRqZm4"], "start_seconds": ["130", "80"], "properties": ["ducks, quack, man", "person, talk, brief"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking with crickets chirping in the background"], "question": "which entity is a person talking?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a stream of water runs briefly"], "sample_ids": ["sSMl2vc3ek", "x-PeY8Yb8M4"], "start_seconds": ["20", "300"], "properties": ["loud, multiple, distance", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "small dogs yip and bark sharply"], "sample_ids": ["yDoT73BWsdA", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["engine revs, tires squeal, vehicle", "bark, yip, sharply"], 
"captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a jet engine spools up and takes off", "an insect buzzes around continuously"], "sample_ids": ["vBslzh7saPw", "v25l1jef3JY"], "start_seconds": ["90", "0"], "properties": ["engine, spools, takes", "buzzes, continuously, insect"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a man speaks while water drains", "people applaud and hoot and chat quietly"], "sample_ids": ["vSeGhaZt-aI", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["water, drain, man", "people, applaud, hoot"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "waves crash against a shoreline and people speak"], "sample_ids": ["wyllXV6PjKo", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["a kid, talk, cry", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a kid speaks followed by music playing", "people applaud and hoot 
and chat quietly"], "sample_ids": ["tQWGZLItBXk", "wwyfGO2J4"], "start_seconds": ["170", "90"], "properties": ["music, kid, speak", "people, applaud, hoot"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a propeller rotates loudly and intensely"], "sample_ids": ["vlJS7LN2XyM", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["background, clocks, ticking", "loud, intense, propeller"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a ticktock of a clock", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["tDlfY3nmx1A", "vJ7JPEFhyLA"], "start_seconds": ["160", "16"], "properties": ["applause, laugh, man", "three men, wind, flow"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "water pouring and bubbling"], "sample_ids": ["yZrFNS7GFBQ", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["pigeon, buzzes, insect", "water, bubbles, pouring"], "captions_pred_video": 
["of the bird in the cage", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["an owl hoots in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds fly and flutter around", "birds chirp quietly and an adult man speaks"], "sample_ids": ["wGKgwOP3h30", "zuua6-5goWw"], "start_seconds": ["30", "30"], "properties": ["fly, flutter, around", "birds, chirp, quiet, man, speaks"], "captions_pred_video": ["of the pigeons in the coop", "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["pigeons coo and flap their wings", "birds are chirping and a man is speaking with background noise "], "question": "which entity is quieter", "label": 1}, 
{"captions": ["a horn blasts loudly as a train passes", "a duck quacks continuously"], "sample_ids": ["zsLxS-uLJTw", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["horn, blast, train", "quacks, continuously, duck"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["vehicles pass by on a roadway", "winds blows roughly as a vehicle races past"], "sample_ids": ["tgbONvsP47Y", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["pass, vehicle, roadway", "wind, blows, vehicle"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car is driving on the road ", "a jet engine roars and wind blows "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["popping and crackling repeats as men yell and laugh", "wind blows as people chatter quietly"], "sample_ids": ["rqu8iB22IY", "xBxDz0CFVn0"], "start_seconds": ["5", "30"], "properties": ["sound, repeats, laugh", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a motorcycle engine is idling", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vZAqdHZ81yA", "vJ7JPEFhyLA"], "start_seconds": ["180", "16"], "properties": ["engine, motorcycle, idling", "three men, wind, flow"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["an 
engine is idling loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "some men converse over an engine running"], "sample_ids": ["ziUT9IFTkjg", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["background, birds, rustling", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more likely to be in a museum", "label": 1}, {"captions": ["a door opens and birds chirp", "a man speaks as horns blow"], "sample_ids": ["yeFvk9x0wWI", "tHyNqRyK34A"], "start_seconds": ["30", "24"], "properties": ["door, open, birds", "a, man, speaks"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "being taken from inside a vehicle on the street at night"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and a car is honking with background noise "], "question": "which entity is a person", "label": 1}, {"captions": ["a woman talking as an infant is crying", "several insects fly while two men talk"], "sample_ids": ["tMbMDvT50j8", "s-T9OVOiMLo"], "start_seconds": ["12", "330"], "properties": ["a, talk, infant", "several, fly, men"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a woman talking to an infant?", "label": 0}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a bird chirps followed by a 
door bell ringing that causes a woman to gasp and the music plays"], "sample_ids": ["yYEVLuqEytU", "sU53zg9Jp7s"], "start_seconds": ["40", "380"], "properties": ["animal, pig, background", "a bird chirps, a door bell ringing, a woman gasps"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a cartoon girl is standing in front of a blue couch"], "captions_pred_audio": ["several sheep bleat and a man speaks", "birds chirp and a doorbell rings with breathing and music in the background "], "question": "which entity has a doorbell?", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a toilet flushes and a female speaks"], "sample_ids": ["zALy31PjDl0", "yaln9y8I7ms"], "start_seconds": ["21", "230"], "properties": ["a man, a vehicle, a horn", "female, flushes, toilet"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["vbpKkWvfOu4", "ugHJF0hfYkg"], "start_seconds": ["560", "10"], "properties": ["a, man, speaks", "loud, intense, propeller"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a helicopter is flying overhead "], "question": "which is louder", "label": 0}, {"captions": ["an small aircraft engine runs and a boy speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult 
male speaking and duck calls being blown"], "sample_ids": ["xSKJGCItUWE", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["engine, run, boy", "rustling, ducks, quack"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a boy speaking?", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yZp6xizR0yU", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["animal, bleat, cry", "three men, wind, flow"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["water splashes and wind noise is made into a microphone", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sDSppXIlJrs", "wDVMhEdTiVw"], "start_seconds": ["27", "30"], "properties": ["microphone, water, wind", "gun, shoot, water"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xBxDz0CFVn0", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["wind, chatter, people", "engine, idle, woman"], "captions_pred_video": ["footage is blurry and out of focus", "shows a group of people riding on a crowded subway 
train"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["someone snores nearby", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["spJCm8tD9Zo", "wyllXV6PjKo"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a baby, a woman, a man"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a woman speaks and a baby cries"], "question": "which entity has more people in it", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "water pouring and bubbling"], "sample_ids": ["xERFUeZONz8", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["ring, approach, traffic", "water, bubbles, pouring"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["an emergency vehicle siren blares", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["running water in a faucet with some clinks", "three men talk while wind blows and 
some liquid flows"], "sample_ids": ["zNRChLjqcU", "vJ7JPEFhyLA"], "start_seconds": ["220", "16"], "properties": ["water, faucet, run", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a liquid flowing?", "label": 1}, {"captions": ["a child yells and another yells", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vMDHu7Lxcgw", "vlS6YMeWAPo"], "start_seconds": ["410", "40"], "properties": ["two, yell, child", "sheep, baa, birds"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "water flows and trickles"], "sample_ids": ["smGI3C1NZc", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["water, drain, toilet", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a toilet is flushed", "water is splashing and gurgling"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a motorcycle engine works nearby", "water is sprayed across a hard surface"], "sample_ids": ["tOSWIURC-4", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["engine, work, nearby", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a lawn mower is running ", "spraying followed by silence"], "question": "which is a liquid", "label": 1}, {"captions": ["a person snores loudly 
multiple times at a close distance", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sSMl2vc3ek", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["loud, multiple, distance", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "people cheer as a vehicle engine revs"], "sample_ids": ["uPDn2BFTHk", "xjhAnI2q6hM"], "start_seconds": ["140", "6"], "properties": ["lady, laugh, baby", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xO-Q2BlIIPU", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["two men, exclamation, speak", "two men, woman, birds"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has two men speaking?", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a duck quacks continuously"], "sample_ids": ["ylpYOorfH4o", "vh30P49Po6s"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "quacks, continuously, duck"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 
dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "an airplane engine runs"], "sample_ids": ["u--KhUW8l1Y", "yVPZ2MNWpms"], "start_seconds": ["0", "0"], "properties": ["engine, sound, horn", "engine, airplane, runs"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a car is driving by on the road "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a church bell rings several times", "someone snores nearby"], "sample_ids": ["sUVVjE3Ucp8", "spJCm8tD9Zo"], "start_seconds": ["0", "90"], "properties": ["ring, bell, several", "someone snores, nearby, someone"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a church bell is ringing ", "a person is snoring loudly"], "question": "which is louder", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "small dogs yip and bark sharply"], "sample_ids": ["xNMovAf3o50", 
"v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["rain, thunder, music", "bark, yip, sharply"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a car speeding up in the distance"], "sample_ids": ["tPJvjq9QePY", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["animal, bleat, moo", "distance, car, speed"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sapQIQUhFc", "yDoT73BWsdA"], "start_seconds": ["280", "10"], "properties": ["water, trickles, flow", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "music plays and animals vocalize as a cartoon character makes sounds"], "sample_ids": ["tOj4tdLRaA", "weDbePuc-Xc"], "start_seconds": ["70", "40"], "properties": ["woman, laugh, baby", "cartoon character, music, vocalize"], "captions_pred_video": [null, "a cartoon frog and a butterfly are sitting on the ground next to each other"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and birds are chirping with a frog croaking in the background "], "question": "which entity has a baby?", "label": 0}, 
{"captions": ["small dogs yip and bark sharply", "a car accelerates and wind blows"], "sample_ids": ["v-wcQf4BDY0", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["bark, yip, sharply", "accelerates, wind, blows"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", null], "captions_pred_audio": ["a dog barks and growls", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "an airplane engine runs"], "sample_ids": ["vuUVPzd2FXw", "yVPZ2MNWpms"], "start_seconds": ["160", "0"], "properties": ["a, steam, release", "engine, airplane, runs"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a car is driving by on the road "], "question": "which object releases steam", "label": 0}, {"captions": ["a man speaks while video game music plays with some clicking", "a toilet flushes and water drains"], "sample_ids": ["tw76HGONaKg", "sfAvvZwdLCY"], "start_seconds": ["570", "20"], "properties": ["music, click, man", "water drains, flushes, water"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the 
legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["people speak in a closed space", "vehicle engines race around a track as a man commentates"], "sample_ids": ["sTpirNYo8vQ", "sZPuqDgX2V0"], "start_seconds": ["30", "30"], "properties": ["people, space, speak", "commentator, race, track"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking and a helicopter is flying overhead "], "question": "which is a video", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a duck quacks continuously"], "sample_ids": ["u--KhUW8l1Y", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "quacks, continuously, duck"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a man speaks as a car is passing by"], "sample_ids": ["xKB8O8LTs6s", "sK4u5T8hW78"], "start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "a, car, pass"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a radio?", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "two women and a man talk while a kid cries"], "sample_ids": ["vzxHnu-SFEw", "wyllXV6PjKo"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "a kid, talk, cry"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman speaks and a baby cries"], "question": "which entity has a kid?", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "an airplane engine runs"], "sample_ids": ["xjhAnI2q6hM", "yVPZ2MNWpms"], "start_seconds": ["6", "0"], "properties": ["wind, blow, loudly", "engine, airplane, runs"], 
"captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a car is driving by on the road "], "question": "which entity is running", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vmrxwuAMb2I", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": ["a dog, inhales, exhales", "loud, laughter, intermittent"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a dog barks and growls", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a heavy rain falls endlessly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wP8ZKrlx3oA", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": ["heavy, rain, fall", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["tIY7qOV3rEM", "vzceMbklWc"], "start_seconds": ["0", "180"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "water, faucet, sink"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking 
and a cat is meowing", "water is running and a man is speaking"], "question": "which entity is a video of a sink?", "label": 1}, {"captions": ["a goat screams and people speak in the background", "dog barking and vehicle engine idling followed shortly by vehicle engine revving"], "sample_ids": ["xC8kbrKJmco", "zY3icUyMdh8"], "start_seconds": ["0", "20"], "properties": ["background, goat, scream", "dog, bark, engine"], "captions_pred_video": [null, "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a goat is bleating ", "a car is driving and dogs are barking and squealing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an aircraft engine runs", "some men converse over an engine running"], "sample_ids": ["yLCORCnd35Q", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["engine, aircraft, runs", "men, converse, engine"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", null], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a telephone rings followed by a woman talking"], "sample_ids": ["yajyRTUQk3U", "tGcFnX0GHI"], "start_seconds": ["400", "0"], "properties": ["noise, woman, speak", "ring, talk, woman"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a dial tone sounds followed by a woman speaking"], "question": "which woman is talking", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "water splashes as an animal walks through"], "sample_ids": ["zcDwZ6W7E3E", "w1ir-sZ3Im8"], "start_seconds": ["180", "90"], "properties": ["man, speak, motorcycles", "animal, water, splashes"], "captions_pred_video": ["2 people riding motorcycles down a 
mountain road with trees lining the sides of the road", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "small dogs yip and bark sharply"], "sample_ids": ["tGcFnX0GHI", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["ring, talk, woman", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a car accelerates and wind blows", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["u0TrcHhkPQ", "vlJS7LN2XyM"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "background, clocks, ticking"], "captions_pred_video": [null, "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a ticktock of a clock"], "question": "which entity is accompanied by clocks ticking in the background?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a small engine idles continuously"], "sample_ids": ["sQGXqGcwOTc", "y5WII6cTH7k"], "start_seconds": ["3", "40"], "properties": ["cling, speak, dishes", "engine, idle, continuously"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "an engine is knocking and vibrating "], "question": "which entity is stationary", 
"label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "an insect buzzes around continuously"], "sample_ids": ["sTpirNYo8vQ", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["a, tone, fast", "buzzes, continuously, insect"], "captions_pred_video": ["of a man taking a selfie on a bus", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a telephone rings followed by a woman talking"], "sample_ids": ["zOZleIRqZm4", "tGcFnX0GHI"], "start_seconds": ["80", "0"], "properties": ["rustling, leaves, person", "ring, talk, woman"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a recording", "label": 1}, {"captions": ["food is frying then a woman speaks", "an aircraft engine runs as wind blows heavily"], "sample_ids": ["ukxt9I7eMMg", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["food, woman, speak", "engine, run, wind"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a jet engine roars and wind blows "], "question": "which entity is a moving object", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": ["w2JXXIAdUdg", "wnpJndXuxLc"], "start_seconds": ["10", "50"], "properties": ["snoring, distance, person", "beeps, loud, whistle"], "captions_pred_video": ["a 
close up shot of a person's mouth with a toothbrush in it", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["t97k0cejSQE", "xfaoyyzw2WU"], "start_seconds": ["250", "180"], "properties": ["bird, chirp, insect", "loud, jet engine, roar"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vimzuGQvdcU", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "rooster, crow, background, men"], "captions_pred_video": ["a group of people are rafting down a river", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a man speaks as crickets sing", "a airplane flies overhead as a woman speaks"], "sample_ids": ["ryFDPxgDOGc", "zj2R0XoFr5k"], "start_seconds": ["570", "50"], "properties": ["a, crickets, sing", "airplane, fly, woman"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman speaks while a 
helicopter flies overhead "], "question": "which entity is a man speaking as crickets sing?", "label": 0}, {"captions": ["an engine runs loudly", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vqZuVbG6-HI", "w5W5Kqtc8E"], "start_seconds": ["130", "100"], "properties": ["loud, engine, run", "wind, blow, vehicle"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a woman speaks as she rubs two objects together"], "sample_ids": ["yVumC9TGknc", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["humming, clock, birds", "two objects, woman, speak"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a series of beeps and chirps", "a woman is speaking and breathing with mechanisms in the background "], 
"question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["slZLHwNbbt4", "zl9Dqx-j7q4"], "start_seconds": ["300", "6"], "properties": ["a, horn, run", "engine, laugh, loud"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "an airplane engine runs"], "sample_ids": ["sG7TyPnFDR0", "yVPZ2MNWpms"], "start_seconds": ["180", "0"], "properties": ["beeps, machine, smoke alarm", "engine, airplane, runs"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a car is driving by on the road "], "question": "which entity is a machine?", "label": 0}, {"captions": ["birds vocalize and a man speaks", "paper is crumpling consistently"], "sample_ids": ["v0wPrLBI3hg", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["vocalize, bird, speak", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "winds blows roughly as a vehicle races past"], "sample_ids": ["zO-LSSY92ZM", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["liquid, surface, sound", 
"wind, blows, vehicle"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["steam is hissing and hissing", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["vehicles pass by on a roadway", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["tgbONvsP47Y", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["pass, vehicle, roadway", "animal, grunts, snorts"], "captions_pred_video": ["footage of a fire truck entering a garage", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking and a baby is crying"], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["someone snores nearby", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["spJCm8tD9Zo", "zl9Dqx-j7q4"], "start_seconds": ["90", "6"], "properties": ["someone snores, nearby, someone", "engine, laugh, loud"], "captions_pred_video": ["of a man laying on the ground with his mouth 
open", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person is snoring loudly", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "an airplane accelerates briefly"], "sample_ids": ["xERFUeZONz8", "zjTG0gaGCUI"], "start_seconds": ["0", "80"], "properties": ["ring, approach, traffic", "accelerates, airplane, briefly"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", null], "captions_pred_audio": ["an emergency vehicle siren blares", "a jet engine roars as wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a stream runs then someone speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wbHTKEJZyhc", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["stream, run, someone", "People, motor, brakes"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees 
and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a stream running?", "label": 0}, {"captions": ["a woman sneezes then speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["x4dZyf9Gbj0", "w34HjHr6gAY"], "start_seconds": ["130", "30"], "properties": ["sneezes, speaks, woman", "beeps, hit, woman"], "captions_pred_video": ["footage is blurry and out of focus", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman sneezes and speaks", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a door opens and birds chirp", "a mechanical buzzing getting louder"], "sample_ids": ["yeFvk9x0wWI", "sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["door, open, birds", "noise, loud, buzzing"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a race car accelerates and revs its engine "], "question": "which entity is 
a noise", "label": 1}, {"captions": ["continuous snoring", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sLkeqCDJIyw", "tiDFTC-5vU"], "start_seconds": ["120", "30"], "properties": ["loud, snoring, noise", "male, duck, laugh"], "captions_pred_video": [", what is the man doing on the couch? sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a man speaks as a car is passing by"], "sample_ids": ["x6ijhqRY38s", "sK4u5T8hW78"], "start_seconds": ["250", "30"], "properties": ["something metal, glass, hit", "a, car, pass"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "water splashes and a door squeaks"], "sample_ids": ["vimzuGQvdcU", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["a, man, yells", "sound, splash, door"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a dog barks and taps with background noise "], "question": "which entity has a door squeaking?", "label": 1}, 
{"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w9lpbUn0hPc", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["male, wind, rustling", "three men, wind, flow"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more wind", "label": 1}, {"captions": ["a toilet flushes and water drains", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sfAvvZwdLCY", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["water drains, flushes, water", "a woman, laughs, animal"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a man speaks followed by another man speaking outside"], "sample_ids": ["vmrxwuAMb2I", "viuTg1M-dqg"], "start_seconds": ["40", "30"], "properties": ["a dog, inhales, exhales", "two men, speak, follow"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking to another person?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["vZAw4apG0Es", 
"tDlfY3nmx1A"], "start_seconds": ["30", "160"], "properties": ["background, clock, ticktocks", "applause, laugh, man"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a clock is ticking and people are talking", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity has a clock ticktocking in the background?", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "a car speeding up in the distance"], "sample_ids": ["x4a9YGIw4ok", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["water, gurgles, stops", "distance, car, speed"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and water splashes", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["w5W5Kqtc8E", "y2bVZ7rz-5M"], "start_seconds": ["100", "280"], "properties": ["water, splashes, motorboat", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["a woman speaks while water pours", "a clock rings and ticktocks"], "sample_ids": ["wTideSjRFS0", "yaLIJu2U4Y"], "start_seconds": ["30", "30"], "properties": ["water, pours, woman", "ring, ticktock, clock"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": null, "question": "which entity is silent", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a 
clock ticktocks"], "sample_ids": ["zgUgkpk78xU", "v-g-j2uTByM"], "start_seconds": ["70", "30"], "properties": ["horn, bells, ring", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["people speak and tapping occurs", "water flows and trickles"], "sample_ids": ["tFCUUGdREgA", "tB7hWb9gTuQ"], "start_seconds": ["70", "30"], "properties": ["people, tap, speak", "water, flow, trickle"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "wind blowing followed by a zoom"], "sample_ids": ["sapQIQUhFc", "vr8ZXjEBhMQ"], "start_seconds": ["280", "150"], "properties": ["water, stream, trickles", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a person is whistling", "a speedboat passes quickly on the water"], "sample_ids": ["sIXTftIuUgw", "tjmoSi330GM"], "start_seconds": ["90", "23"], "properties": ["person, whistling, person", 
"speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a person whistling a song", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zF8yoL0rkbI", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["engine, run, someone", "applause, audience, yells"], "captions_pred_video": ["footage of the traffic on the street at night", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a man speaks as a car is passing by"], "sample_ids": ["vKrYfzleLB8", "sK4u5T8hW78"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "a, car, pass"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more calm", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a man speaks over intermittent keyboard taps"], "sample_ids": ["xM4joTqDVp4", "tw76HGONaKg"], 
"start_seconds": ["160", "570"], "properties": ["background, chirp, birds", "audio, man, keyboard"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "wind blows as people chatter quietly"], "sample_ids": ["w2M4i1mklOA", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "wind, chatter, people"], "captions_pred_video": ["footage of an antique clock", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["vK93VuO0yNc", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["male voice, 
bus, rumble", "sound, chirp, buzz"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "a bee on a purple thistle flower"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a bee buzzes and a woman speaks"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a helicopter engine runs continuously", "an adult woman and an adult man speak"], "sample_ids": ["ugHJF0hfYkg", "zTLVJCo4WEE"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "two people, adult, speak"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman speaks and crickets chirp"], "question": "which entity is speaking", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "wind blowing followed by a zoom"], "sample_ids": ["xOZfdgAgJ9o", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["woman, whimpering, speaking", "wind, blow, zoom"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["water runs into a sink while men speak", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vzceMbklWc", "zFjIWfSD-4"], "start_seconds": ["180", "410"], "properties": ["water, sink, run", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a woman 
speaks as she rubs two objects together"], "sample_ids": ["xjvTpk2Zpr8", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["wind, blows, vehicle", "two objects, woman, speak"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["u2f5NpsoHBg", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["person, laugh, clap", "People, motor, brakes"], "captions_pred_video": ["is being projected on a screen at the front of the stage", null], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity shows a person speaking and laughing and clapping?", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "a 
machine engine runs and a man speaks"], "sample_ids": ["wyllXV6PjKo", "vs65y4qmyBE"], "start_seconds": ["30", "340"], "properties": ["a kid, talk, cry", "engine, run, man"], "captions_pred_video": [null, "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a woman speaks and a baby cries", "a heavy engine is running and men are speaking "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wqADXCzngMw", "zj2R0XoFr5k"], "start_seconds": ["340", "50"], "properties": ["engine, idle, man", "airplane, boy, fly"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a plane flying by?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a car accelerates and wind blows"], "sample_ids": ["sxYkFKFIZD0", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["screech, man, door", "accelerates, wind, blows"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a race car accelerates and revs its engine "], "question": "which car is moving faster", "label": 1}, {"captions": ["music plays and someone speaks 
before gunfire and an explosion occurs", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xKB8O8LTs6s", "y8WEcpOlT3I"], "start_seconds": ["70", "40"], "properties": ["music, gunfire, explosion", "harsh, wind, blows"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "on how to use a sewing machine youtube"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a man speaks with another voice speaking in the background"], "sample_ids": ["uJV8NDaHqqk", "u21-Z5gJCB8"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "background, voice, man"], "captions_pred_video": ["a bee hive in a wooden box", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with background noise and breathing sounds "], "question": "which entity is quieter", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a few ducks quack and scamper and a man speaks"], "sample_ids": ["yRx9txMcBl0", "w2bYrCVLT60"], "start_seconds": ["40", "120"], "properties": ["motors, tires, screech", "ducks, speak, quack"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of the ducks drinking from a pink pool in 
the grass"], "captions_pred_audio": ["a car is revving its engine and skidding ", "ducks are quacking and a man is speaking"], "question": "which entity is a bird?", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a young woman speaks over spraying and another person yells"], "sample_ids": ["xjvTpk2Zpr8", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["wind, blows, vehicle", "person, spray, yell"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking over spraying?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a train horn blows as it passes by"], "sample_ids": ["yRx9txMcBl0", "zVacuqSb4LI"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "horn, blows, train"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["an insect buzzes around continuously", "birds vocalize and chirp continuously"], "sample_ids": ["v25l1jef3JY", "w1mlz3Pe4fU"], "start_seconds": ["0", "300"], "properties": ["buzzes, continuously, insect", "vocalize, chirp, continuously"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a bird in a cage"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "birds are chirping and singing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a door opens and birds chirp", "a man speaks followed by another man speaking outside"], "sample_ids": ["yeFvk9x0wWI", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "two men, speak, follow"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a woman and man are speaking"], "sample_ids": ["x9JovgqUcs", "vbpKkWvfOu4"], "start_seconds": ["500", "560"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "two people, speaking, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a woman is speaking and a man is speaking"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "a stream of water runs briefly"], "sample_ids": ["x4a9YGIw4ok", "x-PeY8Yb8M4"], "start_seconds": ["120", "300"], "properties": ["water, gurgles, stops", "stream, water, run"], "captions_pred_video": ["footage is blurry and out of focus", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a toilet flushes and water splashes", "a car is driving on a wet road "], "question": "which entity has more water", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a clock ticktocks briefly"], "sample_ids": ["uzQnlJXBbOM", "u7C-AEBQM"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a ticktock of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xyx6eNVEYRY", "xKB8O8LTs6s"], "start_seconds": ["380", "70"], "properties": ["loud, engine, muffles", "music, gunfire, explosion"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "music plays while a woman speaks and gunshots are fired "], "question": "which 
entity is more violent", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["x6ijhqRY38s", "zFjIWfSD-4"], "start_seconds": ["250", "410"], "properties": ["bowl, silverware, man", "People, motor, brakes"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man moving silverware in a bowl?", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "water pouring and bubbling"], "sample_ids": ["uPDn2BFTHk", "uyRfq-jKPpo"], "start_seconds": ["140", "50"], "properties": ["lady, laugh, baby", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a baby laughs and a woman speaks", "water is running from a faucet"], "question": "which entity is more likely to be in a bath", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["weDbePuc-Xc", 
"w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["cartoon character, music, vocalize", "wind, blow, vehicle"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["two frogs croak at each other", "small dogs yip and bark sharply"], "sample_ids": ["zg0X6BnhOLQ", "v-wcQf4BDY0"], "start_seconds": ["410", "120"], "properties": ["two frogs, croak, at each other", "bark, yip, sharply"], "captions_pred_video": ["footage of lightning in the sky at night", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a frog is croaking", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a speedboat passes quickly on the water", "a propeller rotates loudly and intensely"], "sample_ids": ["tjmoSi330GM", "ugHJF0hfYkg"], "start_seconds": ["23", "10"], "properties": ["speed, water, boat", "loud, intense, propeller"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a helicopter is flying overhead "], "question": "which is quieter", "label": 0}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "water flows and trickles"], "sample_ids": ["vW4x7S1VfQc", "tB7hWb9gTuQ"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "water, flow, trickle"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["food sizzles in a 
frying pan", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["yYEVLuqEytU", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["grunt, slurp, background", "animal, grunts, snorts"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a duck quacks loudly and continuously"], "sample_ids": ["sNB8zxXneIM", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["several, quack, cocks", "loud, continuous, quacks"], "captions_pred_video": ["a group of geese in a cage", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a duck is quacking loudly"], "question": "which duck is quacking loudly", "label": 1}, {"captions": ["food is frying then a woman speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["ukxt9I7eMMg", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["food, woman, speak", "music, gunfire, explosion"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["water bubbles and gurgles.", "a toilet flushes and water drains unevenly"], "sample_ids": ["tB7hWb9gTuQ", "vhJWZheqaE"], "start_seconds": ["30", 
"0"], "properties": ["bubbles, gurgles, water", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", null], "captions_pred_audio": ["water is splashing and gurgling", "a toilet is flushed"], "question": "which entity is a toilet?", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vK93VuO0yNc", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["male voice, bus, rumble", "wind, blow, vehicle"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", null], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "water flows as men speak and yell"], "sample_ids": ["wSVhSdj0F0", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["beep, clang, footsteps", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and yelling?", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tDlysoZiA1I", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["animal, grunts, chirps", "loud, jet engine, roar"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["birds are chirping and 
a rooster is crowing ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["xyx6eNVEYRY", "xjvTpk2Zpr8"], "start_seconds": ["380", "70"], "properties": ["loud, engine, muffles", "wind, blows, vehicle"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["water splashes as an animal walks through", "wind blowing followed by a zoom"], "sample_ids": ["w1ir-sZ3Im8", "vr8ZXjEBhMQ"], "start_seconds": ["90", "150"], "properties": ["animal, water, splashes", "wind, blow, zoom"], "captions_pred_video": ["footage of a group of people riding horses through a river", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["water splashes and gurgles as people speak", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xSKJGCItUWE", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["engine, run, boy", "loud, multiple, distance"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "a machine engine runs and a man speaks"], "sample_ids": ["s4tUs779vBA", "vs65y4qmyBE"], "start_seconds": ["160", "340"], "properties": ["a, sound, stop", "engine, run, man"], "captions_pred_video": 
["game in 10 words or less - screenshot 1", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a heavy engine is running and men are speaking "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a machine beeps continuously"], "sample_ids": ["wnpJndXuxLc", "y682ml90jGw"], "start_seconds": ["50", "11"], "properties": ["beeps, loud, whistle", "beeps, machine, continuously"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a beeping sound is being made "], "question": "which entity is a machine", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "someone snores nearby"], "sample_ids": ["vh30P49Po6s", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["loud, continuous, quacks", "someone snores, nearby, someone"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a duck is quacking loudly", "a person is snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "an insect buzzes around continuously"], "sample_ids": ["uEU-Hg5MTN8", "v25l1jef3JY"], "start_seconds": ["27", "0"], "properties": ["a woman, laughs, animal", "buzzes, continuously, insect"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a fly is buzzing around a microphone "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a car accelerates 
and wind blows"], "sample_ids": ["uKCSGgof8gI", "u0TrcHhkPQ"], "start_seconds": ["12", "20"], "properties": ["chirps, distance, signal", "accelerates, wind, blows"], "captions_pred_video": ["footage of a street in a small town on a sunny day", null], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["x6ijhqRY38s", "tiDFTC-5vU"], "start_seconds": ["250", "30"], "properties": ["bowl, silverware, man", "male, duck, laugh"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wtDqrBygTcU", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["man, engine, run", "music, gunfire, explosion"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a motor is running", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a person is snoring while sleeping", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vJrjSeP17yE", "tiDFTC-5vU"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "male, duck, laugh"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a man is 
speaking and ducks are quacking"], "question": "which entity is a person", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["uRExseg-0XI", "zj2R0XoFr5k"], "start_seconds": ["210", "50"], "properties": ["woman, man, water", "airplane, boy, fly"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a vehicle engine accelerating then running on idle"], "sample_ids": ["w34HjHr6gAY", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["beeps, hit, woman", "engine, accelerate, idle"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": 
["tK4VlLsNxak", "zj2R0XoFr5k"], "start_seconds": ["120", "50"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "airplane, boy, fly"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "goats bleat and metal clings"], "sample_ids": ["tgbONvsP47Y", "tH17JPjDPnc"], "start_seconds": ["0", "260"], "properties": ["pass, vehicle, roadway", "bleat, metal, clings"], "captions_pred_video": ["footage of a fire truck entering a garage", "feed of the goats eating hay in the barn"], "captions_pred_audio": ["a car is driving on the road ", "a cow is mooing and mechanisms are ticking "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "vehicles pass by on a roadway"], "sample_ids": ["yks4cLgIDMc", "tgbONvsP47Y"], "start_seconds": ["170", "0"], "properties": ["background, speaking, child", "pass, vehicle, roadway"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a child is crying", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "birds chirp and objects are moved around"], "sample_ids": ["s4Uz1Ffgo04", "yPUYU6t3rwo"], "start_seconds": ["100", "370"], "properties": ["water, rushes, vehicle", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage and stock-footage/b-roll of a beekeeper 
opening a beehive"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "insects buzz and a man speaks"], "question": "which entity is more calm", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["zliInBdC98Y", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["a, baby, cries, wails", "engine revs, vehicle, people"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a baby cries and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "some light rustling followed by a loud burp and a girl speaking"], "sample_ids": ["uYT5gxnyMWM", "vdoxuJn9lTc"], "start_seconds": ["50", "40"], "properties": ["female, spraying, scream", "burp, loud, girl"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a child speaks followed by a burp"], "question": "which entity is a burp", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a clock ticktocks"], "sample_ids": ["ylpYOorfH4o", "v-g-j2uTByM"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "ticktocks, clock, ticktocks"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the 
fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a baby cries and a woman moans", "a telephone rings followed by a woman talking"], "sample_ids": ["smDKStoHBJo", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["a, cry, woman", "ring, talk, woman"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "a man speaks as a car is passing by"], "sample_ids": ["s4tUs779vBA", "sK4u5T8hW78"], "start_seconds": ["160", "30"], "properties": ["a, sound, stop", "a, car, pass"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["some tunes played by whistling", "a duck quacks continuously"], "sample_ids": 
["u6BnG6YZqJ4", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["tune, play, whistling", "quacks, continuously, duck"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person whistling a song", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a person snoring several times"], "sample_ids": ["ukg5L09Wpvo", "spJCm8tD9Zo"], "start_seconds": ["150", "90"], "properties": ["clickety-clack, train, whistle", "snore, person, several"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a person is snoring loudly"], "question": "which is louder", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "small dogs growl, bark and yip."], "sample_ids": ["vf44CgrjT0A", "sShpyu2l4YQ"], "start_seconds": ["20", "0"], "properties": ["loud, long, person", "growl, bark, yip"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "the puppies are playing with a toy"], "captions_pred_audio": ["a loud burp", "a dog is barking and growling"], "question": "which entity is more likely to be a dog", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "wind blows as people chatter quietly"], "sample_ids": ["yeFvk9x0wWI", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["clack, bird, chirp", "wind, chatter, people"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage is blurry and out of focus"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with wind 
noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "water flows and trickles"], "sample_ids": ["xyx6eNVEYRY", "tB7hWb9gTuQ"], "start_seconds": ["380", "30"], "properties": ["loud, engine, muffles", "water, flow, trickle"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a child speaks in closed space"], "sample_ids": ["y2ZBGpgbhHM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["dog, chirp, breathe", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xjvTpk2Zpr8", "su6FAOcOA8c"], "start_seconds": ["70", "4"], "properties": ["engine, run, wind", "engine, idle, woman"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "pigeons vocalize and birds chirp"], "sample_ids": ["tDVADusiIoc", "uiS58TNyUiw"], "start_seconds": ["60", "430"], "properties": ["wind, radio, 
waves", "vocalize, bird, chirp"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["y8dSeubCNI", "vbZ-0lGPneg"], "start_seconds": ["4", "30"], "properties": ["engine revving, people speaking, motorcycle", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["tDVADusiIoc", "ukg5L09Wpvo"], "start_seconds": ["60", "150"], "properties": ["water, radio, man", "clickety-clack, train, whistle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a woman speaks as she rubs two objects together"], "sample_ids": ["smDKStoHBJo", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["a, talk, baby, cry", "two objects, woman, speak"], "captions_pred_video": ["a man holding a crying baby in his arms", "how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman is speaking", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "birds chirp and objects are moved around"], "sample_ids": ["w2JXXIAdUdg", "yPUYU6t3rwo"], "start_seconds": ["10", "370"], "properties": ["snoring, distance, person", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a person snoring and a dog whimpering", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["scraping and female speech with distant music", "vehicles pass by on a roadway"], "sample_ids": ["yHeVV-xeOxQ", "tgbONvsP47Y"], "start_seconds": ["130", "0"], "properties": ["female, speech, music", "pass, vehicle, roadway"], "captions_pred_video": ["of a girl milking a goat's udder", "footage of a fire truck 
entering a garage"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a car is driving on the road "], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "pigeons vocalize and birds chirp"], "sample_ids": ["yeFvk9x0wWI", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["clack, bird, chirp", "vocalize, bird, chirp"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of the pigeon in the cage"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["wind blows strongly", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w8uLijTqtlU", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["wind, blows, strongly", "three men, wind, flow"], "captions_pred_video": ["footage is blurry and shaky", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a liquid flowing?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a machine beeps continuously"], "sample_ids": ["vimzuGQvdcU", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["a, man, yells", "beeps, machine, continuously"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sfAvvZwdLCY", "xfaoyyzw2WU"], "start_seconds": ["20", "180"], "properties": ["water drains, 
flushes, water", "loud, jet engine, roar"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a toilet is flushed", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a baby laugh at a sputter", "an engine revs and a turning noise is made"], "sample_ids": ["sLUnaPT5gM8", "tOSWIURC-4"], "start_seconds": ["0", "0"], "properties": ["laugh, sputter, baby", "noise, engine, revs"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a lawn mower is running "], "question": "which noise is made by an engine", "label": 1}, {"captions": ["a woman and man are speaking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vbpKkWvfOu4", "sLUnaPT5gM8"], "start_seconds": ["560", "0"], "properties": ["two people, speaking, woman, man", "loud, laughter, intermittent"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["multiple people speak and children yell while water gurgles", "people speaking indiscriminately in the distance with a person snoring loudly nearby"], "sample_ids": ["vb1fPSDI4c", "w2JXXIAdUdg"], "start_seconds": ["30", "10"], "properties": ["multiple, people, yell", "snoring, distance, person"], "captions_pred_video": [null, "a close up shot of a person's mouth with a toothbrush in it"], 
"captions_pred_audio": ["a crowd of people are talking and laughing", "a person snoring and a dog whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blowing followed by a zoom", "wind blowing followed by a zoom"], "sample_ids": ["vr8ZXjEBhMQ", "vr8ZXjEBhMQ"], "start_seconds": ["150", "150"], "properties": ["wind, blow, zoom", "wind, blow, zoom"], "captions_pred_video": ["is taken from a motorcycle's point of view", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is blowing the wind", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uKCSGgof8gI", "uEU-Hg5MTN8"], "start_seconds": ["12", "27"], "properties": ["chirps, distance, signal", "a woman, laughs, animal"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video of a woman laughing?", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["vlJS7LN2XyM", "wSVhSdj0F0"], "start_seconds": ["30", "10"], "properties": ["background, clocks, ticking", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a car horn honks and keys jangle with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a helicopter 
engine idles continuously", "a horn rings out as a machine runs by"], "sample_ids": ["ugHJF0hfYkg", "slZLHwNbbt4"], "start_seconds": ["10", "300"], "properties": ["engine, idle, continuously", "a, horn, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a helicopter is flying overhead ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["dogs barking and whimpering", "a man speaks as a motor runs in the background"], "sample_ids": ["tIY7qOV3rEM", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "background, motor, run"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "a clock ticktocks"], "sample_ids": ["yDoT73BWsdA", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["engine revs, tires squeal, vehicle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["yks4cLgIDMc", "xfaoyyzw2WU"], "start_seconds": ["170", "180"], "properties": ["background, speaking, child", "loud, jet engine, roar"], "captions_pred_video": ["footage of two kids wrestling on the floor", 
"footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and a child is crying", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["ugHJF0hfYkg", "wSVhSdj0F0"], "start_seconds": ["10", "10"], "properties": ["loud, propeller, move", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a car horn honks and keys jangle with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["u21-Z5gJCB8", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["background, voice, man", "wind, blows, vehicle"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a clock ticktocks continuously", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vlJS7LN2XyM", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "gun, shoot, water"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a ticktock of a clock", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a 
war", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["yLy-WycbVVE", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "beeps, hit, woman"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "cats meow and then a person begins to talk while the cats continue to meow"], "sample_ids": ["xERFUeZONz8", "x5cuQjOdM3E"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "cat, talk, meow"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["an emergency vehicle siren blares", "a cat meows and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vVhthZ45k3Y", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["cat, purr, hiss", "applause, audience, yells"], "captions_pred_video": ["footage 
is blurry and out of focus", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "people applaud and hoot and chat quietly"], "sample_ids": ["vmrxwuAMb2I", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["a dog, inhales, exhales", "people, applaud, hoot"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", null], "captions_pred_audio": ["a dog barks and growls", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["ugHJF0hfYkg", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["loud, propeller, move", "airplane, boy, fly"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman speaks while a helicopter flies overhead "], "question": "which is not a moving object", "label": 0}, {"captions": ["people converse as a motor runs and air brakes hiss", "a man speaks as a motor runs in the background"], "sample_ids": ["zFjIWfSD-4", "xZepNM9qcRA"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, 
{"captions": ["a pigeon cooing as an insect buzzes by briefly", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yZrFNS7GFBQ", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["pigeon, buzzes, insect", "water, radio, man"], "captions_pred_video": ["of the bird in the cage", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tgbONvsP47Y", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["noise, truck, accelerate", "music, gunfire, explosion"], "captions_pred_video": ["footage of a fire truck entering a garage", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car is driving on the road ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "a clock ticktocks"], "sample_ids": ["tMJne1a4AFI", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["wind, buzz, rustling", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a swarm of bees on the ground", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a swarm of bees buzzing around", "a clock is ticking loudly"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["a flush is followed by gurgling water, then another flush", "water splashes and a motorboat passes as people yell"], "sample_ids": ["tqR406bGiE", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["flush, water, gurgle", "water, splashes, motorboat"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["a toilet is flushed", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about water?", "label": 0}, {"captions": ["a train horn blows as it passes by", "an airplane engine runs"], "sample_ids": ["zVacuqSb4LI", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["horn, blows, train", "engine, airplane, runs"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a car is driving by on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["s6DESzUTGjY", "zFjIWfSD-4"], "start_seconds": ["16", "410"], "properties": ["wind, laugh, woman", "People, motor, brakes"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube 
how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tw76HGONaKg", "sSMl2vc3ek"], "start_seconds": ["570", "20"], "properties": ["A, game, keyboard", "loud, multiple, distance"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a person snoring loudly"], "question": "which entity is not a person?", 
"label": 0}, {"captions": ["a infant makes noise and is excited", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wIJK3-5y0kA", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["noise, excited, infant", "harsh, wind, blows"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is not a person", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a car speeding up in the distance"], "sample_ids": ["wy1eKjR7KC0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["people, talk, distance", "distance, car, speed"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "a race car accelerates and revs its engine "], "question": "which is farther away", "label": 0}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a infant makes noise and is excited"], "sample_ids": ["vW4x7S1VfQc", "wIJK3-5y0kA"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "noise, excited, infant"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["food sizzles in a frying pan", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a clock ticktocks in wind", "a stream of water runs briefly"], "sample_ids": ["yVumC9TGknc", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["ticktocks, clock, wind", "stream, water, run"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "a man sitting on a rock in the 
middle of a river"], "captions_pred_audio": ["a series of beeps and chirps", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xSKJGCItUWE", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", "a woman, a television program, a bird"], "captions_pred_video": ["footage of the helicopter flying in the room", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vddP56-ogds", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["liquid, laughs, man", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking while food is frying in the background"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a man talks followed by a woman shouting"], "sample_ids": ["sncRqQ67iJU", "s3cTDAj31g"], "start_seconds": ["460", "80"], "properties": ["loud, repeatedly, man", "man, talk, woman"], "captions_pred_video": ["of an airplane flying in the dark sky at night", null], "captions_pred_audio": ["a person is snoring", "a man is speaking and a baby is crying"], "question": "which entity is a man talking to a woman?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "people applaud and hoot and chat quietly"], "sample_ids": ["v-wcQf4BDY0", "wwyfGO2J4"], "start_seconds": ["120", "90"], "properties": ["bark, yip, 
sharply", "people, applaud, hoot"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", null], "captions_pred_audio": ["a dog barks and growls", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a car accelerates and wind blows"], "sample_ids": ["vBHyYJ8pL0", "u0TrcHhkPQ"], "start_seconds": ["2", "20"], "properties": ["noise, door, opening", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a race car accelerates and revs its engine "], "question": "which entity is a car?", "label": 1}, {"captions": ["bees buzz and wind blows", "wind blows as people chatter quietly"], "sample_ids": ["tMJne1a4AFI", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["bees buzz, wind blows, bees", "wind, chatter, people"], "captions_pred_video": ["a swarm of bees on the ground", "footage is blurry and out of focus"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a duck quacks several times", "an infant crying frantically"], "sample_ids": ["vh30P49Po6s", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["quacks, duck, several", "cry, infant, frantically"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of the baby crying in the car seat"], "captions_pred_audio": ["a duck is quacking loudly", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["uYT5gxnyMWM", "tw76HGONaKg"], "start_seconds": ["50", "570"], "properties": 
["person, spray, yell", "A, game, keyboard"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks and types on a computer keyboard "], "question": "which entity is a video game?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "water running down a sink while a man is talking"], "sample_ids": ["tOSWIURC-4", "vSeGhaZt-aI"], "start_seconds": ["0", "50"], "properties": ["engine, work, nearby", "water, sink, talk"], "captions_pred_video": [null, "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking and pouring liquid with background noise "], "question": "which is a source of water", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vddP56-ogds", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy 
day"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["goats bleat and metal clings", "a boat travels through the waves as the wind blows loudly and a man speaks over a radio"], "sample_ids": ["tH17JPjDPnc", "tDVADusiIoc"], "start_seconds": ["260", "60"], "properties": ["bleat, metal, clings", "wind, radio, waves"], "captions_pred_video": ["feed of the goats eating hay in the barn", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["waves crash against a shoreline and people speak", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["yFB25fqfU8I", "wyllXV6PjKo"], "start_seconds": ["300", "30"], "properties": ["wave, crash, shoreline", "a baby, a woman, a man"], "captions_pred_video": ["footage of a person surfing in the ocean", null], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a woman speaks and a baby cries"], "question": "which entity is more likely to be a video of a baby crying?", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["zj2R0XoFr5k", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["airplane, fly, overhead", "a woman, chirps, animal"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a dog is barking "], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a duck quacks 
continuously"], "sample_ids": ["tDVADusiIoc", "vh30P49Po6s"], "start_seconds": ["60", "30"], "properties": ["man, radio, blows", "quacks, continuously, duck"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a duck is quacking loudly"], "question": "which entity is speaking", "label": 0}, {"captions": ["some people speak", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vbZ-0lGPneg", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "music, gunfire, explosion"], "captions_pred_video": ["of a man holding a baby duck in his hands", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a man speaks over intermittent keyboard taps"], "sample_ids": ["uzQnlJXBbOM", "tw76HGONaKg"], "start_seconds": ["50", "570"], "properties": ["ringing, beep, stop", "audio, man, keyboard"], "captions_pred_video": ["footage of a person using a cell phone on a table", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina 
of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a telephone rings and a man speaks", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a man speaks as a car is passing by"], "sample_ids": ["tQWGZLItBXk", "sK4u5T8hW78"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "a, car, pass"], "captions_pred_video": ["worms revolution screenshots", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xOZfdgAgJ9o", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["woman, whimpering, speaking", "water, radio, man"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a baby is 
crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["an audience gives applause as a man yells and a group sings", "an airplane engine spools and people speak"], "sample_ids": ["tdWhHV3X25Q", "wTjoRj1se3U"], "start_seconds": ["60", "390"], "properties": ["applause, audience, yells", "airplane, engine, spool"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["continuous snoring", "a clock ticktocks"], "sample_ids": ["sLkeqCDJIyw", "v-g-j2uTByM"], "start_seconds": ["120", "30"], "properties": ["loud, snoring, noise", "ticktocks, clock, ticktocks"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person is snoring loudly", "a clock is ticking loudly"], "question": "which entity makes a ticktocks noise", "label": 1}, {"captions": ["people speak and tapping occurs", "pigeons vocalize and birds chirp"], "sample_ids": ["tFCUUGdREgA", "uiS58TNyUiw"], "start_seconds": ["70", "430"], "properties": ["people, tap, speak", "vocalize, bird, chirp"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a clock ticktocks briefly", "a vehicle engine accelerating then running on idle"], "sample_ids": ["u7C-AEBQM", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks briefly", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a ticktock of a clock", "an engine is idling"], "question": "which is a moving object", "label": 1}, {"captions": ["water runs into a sink while men speak", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vzceMbklWc", "vfYTJq7nU"], "start_seconds": ["180", "130"], "properties": ["water, sink, run", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "a man speaks as a car is passing by"], "sample_ids": ["ugHJF0hfYkg", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "a, car, pass"], "captions_pred_video": ["a man 
in a helicopter cockpit wearing headphones", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is stationary", "label": 1}, {"captions": ["a motorcycle engine is idling", "an airplane engine runs"], "sample_ids": ["vZAqdHZ81yA", "yVPZ2MNWpms"], "start_seconds": ["180", "0"], "properties": ["engine, motorcycle, idling", "engine, airplane, runs"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["an engine is idling loudly", "a car is driving by on the road "], "question": "which entity has a moving engine", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "waves crash against a shoreline and people speak"], "sample_ids": ["tw76HGONaKg", "yFB25fqfU8I"], "start_seconds": ["570", "300"], "properties": ["A, game, keyboard", "wave, crash, shoreline"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina 
of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is a natural phenomenon", "label": 1}, {"captions": ["a beep repeats multiple times", "a stream of water flows as people talk and wind blows"], "sample_ids": ["y682ml90jGw", "xBxDz0CFVn0"], "start_seconds": ["11", "30"], "properties": ["beep, repeat, multiple", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a duck quacks continuously"], "sample_ids": ["rqfQRErjfk8", "vh30P49Po6s"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "quacks, continuously, duck"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["y8dSeubCNI", "yeFvk9x0wWI"], "start_seconds": ["4", "30"], "properties": ["men, women, car", "clack, bird, chirp"], "captions_pred_video": [null, "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["an engine revving and people talking in the 
background", "birds chirp in the background as a car drives by "], "question": "which entity is quieter", "label": 1}, {"captions": ["paper folding and crinkling", "some men converse over an engine running"], "sample_ids": ["zPpG3RD8lSs", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["paper, fold, crinkle", "men, converse, engine"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", null], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is a more active scene", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["v0x1odnXtP0", "tDVADusiIoc"], "start_seconds": ["210", "60"], "properties": ["keyboard, type, computer", "water, radio, man"], "captions_pred_video": ["how to make money on youtube in spanish", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a horn blows 
as a train chugs along and warning bells ring", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["ukg5L09Wpvo", "tDVADusiIoc"], "start_seconds": ["150", "60"], "properties": ["a train, a horn, a bell", "water, radio, man"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a horn?", "label": 0}, {"captions": ["paper is repeatedly crumpled and crinkled", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vms5XGTDVQc", "wDVMhEdTiVw"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "gun, shoot, water"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["paper is crumpled and crinkled", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a gun?", "label": 0}, {"captions": ["water is sprayed across a hard surface", "people applaud and hoot and chat quietly"], "sample_ids": ["sQwlkXjQabo", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["water, spray, surface", "people, applaud, hoot"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["a clock ticktocks briefly", "a crowd yells, reacts and applauds"], "sample_ids": ["u7C-AEBQM", "wztCSUxOf8"], "start_seconds": ["30", "130"], "properties": ["ticktocks, clock, ticktocks briefly", "a crowd, yells, applauds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a 
clock", "a man is speaking and a crowd is clapping"], "question": "which entity is more animated", "label": 1}, {"captions": ["an airplane engine spools and people speak", "waves crash against a shoreline and people speak"], "sample_ids": ["wTjoRj1se3U", "yFB25fqfU8I"], "start_seconds": ["390", "300"], "properties": ["airplane, engine, spool", "wave, crash, shoreline"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["y8dSeubCNI", "wDVMhEdTiVw"], "start_seconds": ["4", "30"], "properties": ["men, women, car", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["an engine revving and people talking in the background", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yRx9txMcBl0", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["motors, tires, screech", "water, radio, man"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, 
gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water?", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "wind blows as people chatter quietly"], "sample_ids": ["x6ijhqRY38s", "xBxDz0CFVn0"], "start_seconds": ["250", "30"], "properties": ["something metal, glass, hit", "wind, chatter, people"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blowing followed by a zoom", "water pouring and bubbling"], "sample_ids": ["vr8ZXjEBhMQ", "uyRfq-jKPpo"], "start_seconds": ["150", "50"], "properties": ["wind, blow, zoom", "water, bubbles, pouring"], "captions_pred_video": ["is taken from a motorcycle's point of view", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "water is running from a faucet"], "question": "which entity is more likely to be a video of a windy day?", "label": 0}, {"captions": ["an small aircraft engine runs and a boy speaks", "someone is typing on a computer keyboard"], "sample_ids": ["xSKJGCItUWE", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["engine, run, boy", "keyboard, type, computer"], "captions_pred_video": ["footage of the helicopter flying in the room", "how to make money on youtube in spanish"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a drill runs and two people laugh", "wind blows as people chatter quietly"], "sample_ids": ["tEE3MpBt1sg", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "wind, chatter, people"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage is blurry and out of focus"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sEprKHm8Sj8", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["car, tires, slows", "three men, wind, flow"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a race car accelerates and 
revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a car?", "label": 1}, {"captions": ["a church bell rings several times", "some men converse over an engine running"], "sample_ids": ["sUVVjE3Ucp8", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["ring, bell, several", "men, converse, engine"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", null], "captions_pred_audio": ["a church bell is ringing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "pigeons vocalize and birds chirp"], "sample_ids": ["ukg5L09Wpvo", "uiS58TNyUiw"], "start_seconds": ["150", "430"], "properties": ["clickety-clack, train, whistle", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "of the pigeon in the cage"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a loud engine muffles a man as he speaks"], "sample_ids": ["t97k0cejSQE", "xyx6eNVEYRY"], "start_seconds": ["250", "380"], "properties": ["bird, chirp, insect", "loud, engine, muffles"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["ukxt9I7eMMg", "vb1fPSDI4c"], "start_seconds": ["30", "30"], 
"properties": ["food, pan, cook", "multiple, people, yell"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "some tunes played by whistling"], "sample_ids": ["vz8868znkVQ", "u6BnG6YZqJ4"], "start_seconds": ["60", "0"], "properties": ["audio, click, kid speaking", "tune, play, whistling"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a clock ticktocks"], "sample_ids": ["xKB8O8LTs6s", "v-g-j2uTByM"], "start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "ticktocks, clock, ticktocks"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xOZfdgAgJ9o", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["woman, whimpering, speaking", "stream, water, flow"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with wind 
noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a person snoring several times"], "sample_ids": ["zsLxS-uLJTw", "spJCm8tD9Zo"], "start_seconds": ["20", "90"], "properties": ["horn, blast, train", "snore, person, several"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a person is snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a man speaks followed by another man speaking outside"], "sample_ids": ["sxYkFKFIZD0", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["screech, man, door", "two men, speak, follow"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "several insects fly while two men talk"], "sample_ids": ["rqu8iB22IY", "s-T9OVOiMLo"], "start_seconds": ["5", "330"], "properties": ["sound, repeats, laugh", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", 
"a man is speaking while insects are buzzing in the background "], "question": "which entity is about insects?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "birds chirp and objects are moved around"], "sample_ids": ["siJFXfGWgDk", "yPUYU6t3rwo"], "start_seconds": ["50", "370"], "properties": ["a, bird, vehicle", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "insects buzz and a man speaks"], "question": "which entity has more birds", "label": 1}, {"captions": ["an insect buzzes around continuously", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["v25l1jef3JY", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["buzzes, continuously, insect", "applause, audience, yells"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking and a crowd is clapping"], "question": "which entity is not a person", "label": 0}, {"captions": ["a person uses a saw to cut some wood", "a propeller rotates loudly and intensely"], "sample_ids": ["sHbXC6na9hg", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["a person, saw, wood", "loud, intense, propeller"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["an engine is idling and vibrating", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a helicopter engine idles continuously", "people speak in the background as a clock ticktocks"], "sample_ids": 
["ugHJF0hfYkg", "vZAw4apG0Es"], "start_seconds": ["10", "30"], "properties": ["engine, idle, continuously", "background, clock, ticktocks"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a helicopter is flying overhead ", "a clock is ticking and people are talking"], "question": "which entity is a clock", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a toilet flushes and a female speaks"], "sample_ids": ["xSKJGCItUWE", "yaln9y8I7ms"], "start_seconds": ["10", "230"], "properties": ["engine, work, child", "female, flushes, toilet"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage is blurry and out of focus"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["water flows as men speak and yell", "water pouring and bubbling"], "sample_ids": ["vJ7JPEFhyLA", "uyRfq-jKPpo"], "start_seconds": ["16", "50"], "properties": ["water, flow, men", "water, bubbles, pouring"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a telephone rings followed by a woman talking"], "sample_ids": ["smDKStoHBJo", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["a, talk, baby, cry", "ring, talk, woman"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation between two people?", "label": 0}, {"captions": ["a man speaks as a boat engine runs", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wtDqrBygTcU", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["man, engine, run", "men, talk, cars"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and a motor is running", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has a man speaking as an engine runs?", "label": 0}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "a person snores loudly multiple times 
at a close distance"], "sample_ids": ["w9lpbUn0hPc", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["male, wind, rustling", "loud, multiple, distance"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", null], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a clock ticks quietly and rhythmically"], "sample_ids": ["zofjfKhqLk8", "u7C-AEBQM"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "ticks, rhythmic, quiet"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a person sniffs and sneezes"], "sample_ids": ["wy1eKjR7KC0", "uRlbY6aoBU"], "start_seconds": ["30", "0"], "properties": ["people, talk, distance", "sneezes, person, sniffs"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "a man is sneezing "], "question": "which entity is more likely to be a sneeze", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "wind blowing followed by a zoom"], "sample_ids": ["w-4gHptFNuU", "vr8ZXjEBhMQ"], "start_seconds": ["21", "150"], "properties": ["engine revs, accelerates, bump", "wind, blow, zoom"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a car accelerates and revs its engine ", "wind 
blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "people speak in a closed space"], "sample_ids": ["zofjfKhqLk8", "sTpirNYo8vQ"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "people, space, speak"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while a car is revving and accelerating "], "question": "which entity is more quiet", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a pigeon cooing as an insect buzzes by briefly"], "sample_ids": ["wvKpEYswXO0", "yZrFNS7GFBQ"], "start_seconds": ["150", "30"], "properties": ["plastic, tap, speak", "pigeon, buzzes, insect"], "captions_pred_video": ["of the person preparing food in the kitchen", "of the bird in the cage"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "an owl hoots in the background "], "question": "which entity is a bird?", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["smDKStoHBJo", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["a, infant, speaking", "a, scream, girl"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has a scream", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a stream of water runs briefly"], "sample_ids": ["wwyfGO2J4", "x-PeY8Yb8M4"], "start_seconds": ["90", "300"], "properties": 
["people, applaud, hoot", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["xvDdE3zNf8Y", "w34HjHr6gAY"], "start_seconds": ["120", "30"], "properties": ["a, female, speaks", "beeps, hit, woman"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman speaks and crumples paper", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["someone whistles a tune", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sIXTftIuUgw", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a telephone rings followed by a woman talking"], "sample_ids": ["wvKpEYswXO0", "tGcFnX0GHI"], 
"start_seconds": ["150", "0"], "properties": ["sound, water, running", "ring, talk, woman"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["se87d6yxEOA", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["run, whistle, pass", "music, gunfire, explosion"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["food is frying while a woman speaks", "water splashes and a door squeaks"], "sample_ids": ["yhQ2Lg-7qDY", "sdXV-ylviw"], "start_seconds": ["130", "190"], "properties": ["food, woman, speak", "sound, splash, door"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a dog barks and taps with background noise "], "question": "which entity is silent", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["uYT5gxnyMWM", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["a, scream, girl", "airplane, boy, fly"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], 
"captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a girl speaking followed by a scream?", "label": 0}, {"captions": ["a beep repeats multiple times", "paper is crumpling consistently"], "sample_ids": ["y682ml90jGw", "v5cSxLaHADY"], "start_seconds": ["11", "0"], "properties": ["beep, repeat, multiple", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a beeping sound is being made ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vBHyYJ8pL0", "zFjIWfSD-4"], "start_seconds": ["2", "410"], "properties": ["noise, door, opening", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a door opening and closing?", "label": 0}, {"captions": ["a baby laugh at a sputter", "a child speaks in closed space"], "sample_ids": ["sLUnaPT5gM8", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["laugh, sputter, baby", "child, space, speak"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "an infant crying frantically"], "sample_ids": ["sNB8zxXneIM", "zwOBqeFTgiU"], "start_seconds": 
["20", "30"], "properties": ["several, quack, cocks", "cry, infant, frantically"], "captions_pred_video": ["a group of geese in a cage", "of the baby crying in the car seat"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "paper is crumpling consistently"], "sample_ids": ["tDlysoZiA1I", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["animal, grunt, multiple", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a child speaks in closed space"], "sample_ids": ["slZLHwNbbt4", "yW6FWLSLkx4"], "start_seconds": ["300", "40"], "properties": ["train, horn, sound", "child, space, speak"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a man speaks as a car is passing by"], "sample_ids": ["weDbePuc-Xc", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["music, slaps, human", "a, car, pass"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wP8ZKrlx3oA", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["fall, rain, splash", "a woman, something, fried"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "water pouring and bubbling"], "sample_ids": ["uZesmtKZGSw", "uyRfq-jKPpo"], "start_seconds": ["250", "50"], "properties": ["car, track, man", "water, bubbles, pouring"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "on youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "vehicles pass by on a roadway"], "sample_ids": ["ul60S8TXDA8", "tgbONvsP47Y"], "start_seconds": ["60", "0"], "properties": ["sound, distance, bell", "pass, vehicle, roadway"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak softly as food sizzles", "vehicles pass by on a roadway"], "sample_ids": ["yhQ2Lg-7qDY", "tgbONvsP47Y"], "start_seconds": ["130", "0"], "properties": ["food, sizzle, speak", "pass, vehicle, roadway"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine beeps continuously", 
"multiple people speak and children yell while water gurgles"], "sample_ids": ["y682ml90jGw", "vb1fPSDI4c"], "start_seconds": ["11", "30"], "properties": ["beeps, machine, continuously", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a church bell rings several times", "someone is typing on a computer keyboard"], "sample_ids": ["sUVVjE3Ucp8", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["ring, bell, several", "keyboard, type, computer"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "how to make money on youtube in spanish"], "captions_pred_audio": ["a church bell is ringing ", "a person is typing on a keyboard"], "question": "which is not a type of computer", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "a man speaks, then dials a rotary telephone"], "sample_ids": ["zF8yoL0rkbI", "tK4VlLsNxak"], "start_seconds": ["30", "120"], "properties": ["engine, run, someone", "a, dial, telephone"], "captions_pred_video": ["footage of the traffic on the street at night", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking and using a sewing machine"], "question": "which entity is a rotary telephone?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "vehicle engines race around a track as a man commentates"], "sample_ids": ["spJCm8tD9Zo", "sZPuqDgX2V0"], "start_seconds": ["90", "30"], "properties": ["snores, wheezes, sleeps", "commentator, race, track"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a helicopter is flying overhead "], 
"question": "which entity is a video of a race?", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "three men talk while wind blows and some liquid flows"], "sample_ids": ["t25U-v4k4ts", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["a, chirps, bird", "three men, wind, flow"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a man speaks as a car is passing by"], "sample_ids": ["s4Uz1Ffgo04", "sK4u5T8hW78"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "a, car, pass"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is quieter", "label": 1}, {"captions": ["a car accelerates and wind blows", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["u0TrcHhkPQ", "vbZ-0lGPneg"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck 
in his hands"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "some tunes played by whistling"], "sample_ids": ["xZepNM9qcRA", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["background, motor, run", "tune, play, whistling"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "water flows and trickles"], "sample_ids": ["vBslzh7saPw", "tB7hWb9gTuQ"], "start_seconds": ["90", "30"], "properties": ["engine, roar, louder", "water, flow, trickle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a jet engine roars and accelerates ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["long loud burping by a man", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xmiUIOhtZyQ", "wqZ135Ssz0"], "start_seconds": ["60", "60"], "properties": ["loud, burp, man", "two men, woman, birds"], "captions_pred_video": ["homer simpson drinking a beer", null], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["two frogs croak at each other", "a door slams shut roughly"], "sample_ids": ["zg0X6BnhOLQ", "zkKdxzNC97Y"], "start_seconds": ["410", "27"], 
"properties": ["two frogs, croak, at each other", "a door, slams, shut"], "captions_pred_video": ["footage of lightning in the sky at night", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a frog is croaking", "a door is opened and closed"], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "small dogs yip and bark sharply"], "sample_ids": ["wIvYjuR3nrg", "v-wcQf4BDY0"], "start_seconds": ["9", "120"], "properties": ["birds, pigeons, vocalize", "bark, yip, sharply"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds are chirping and cooing", "a dog barks and growls"], "question": "which animal is more vocal", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "people speak as gunfire rings out"], "sample_ids": ["vddP56-ogds", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["water, splash, person, laugh", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["wind blows strongly", "some clanking with distant murmuring"], "sample_ids": ["w8uLijTqtlU", "uMTTDZ2mb4"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "clanking, murmuring, distant"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "people are talking and a car is driving by with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man makes an exclamation, then another man 
speaks", "water is sprayed across a hard surface"], "sample_ids": ["xO-Q2BlIIPU", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["two men, exclamation, speak", "water, spray, surface"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "spraying followed by silence"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "plastic is tapped on while someone speaks"], "sample_ids": ["zO-LSSY92ZM", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["liquid, surface, sound", "plastic, tap, speak"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "of the person preparing food in the kitchen"], "captions_pred_audio": ["steam is hissing and hissing", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on", "label": 1}, {"captions": 
["birds chirp in the background while a horse neighs followed by a girl speaking", "wind blows as people chatter quietly"], "sample_ids": ["s59PfAghdkM", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["bird, chirp, background, horse, neigh", "wind, chatter, people"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a car speeding up in the distance"], "sample_ids": ["x5cuQjOdM3E", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["cat, talk, meow", "distance, car, speed"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "pigeons vocalize and birds chirp"], "sample_ids": ["zcDwZ6W7E3E", "uiS58TNyUiw"], "start_seconds": ["180", "430"], "properties": ["a, man, speak", "vocalize, bird, chirp"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a pigeon cooing as an insect buzzes by briefly"], "sample_ids": ["yaln9y8I7ms", "yZrFNS7GFBQ"], "start_seconds": ["230", "30"], "properties": ["female, flushes, toilet", "pigeon, buzzes, insect"], 
"captions_pred_video": ["footage is blurry and out of focus", "of the bird in the cage"], "captions_pred_audio": ["a toilet flushes and a man speaks", "an owl hoots in the background "], "question": "which entity is a bird?", "label": 1}, {"captions": ["goats bleat and people speak", "a child speaks in closed space"], "sample_ids": ["z5iUE5h0EPs", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["goats bleat, people speak, language", "child, space, speak"], "captions_pred_video": ["of the goat in the barn", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a goat bleats and a man speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a person is snoring while sleeping", "a car accelerates and wind blows"], "sample_ids": ["vJrjSeP17yE", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["a person is sleeping, snoring, person", "accelerates, wind, blows"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["xM4joTqDVp4", "xjvTpk2Zpr8"], "start_seconds": ["160", "70"], "properties": ["background, chirp, birds", "wind, blows, vehicle"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "music plays and 
someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sapQIQUhFc", "xKB8O8LTs6s"], "start_seconds": ["280", "70"], "properties": ["liquid, flow, distance", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a woman speaks happily and an animal chirps"], "sample_ids": ["tDlysoZiA1I", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["animal, grunts, chirps", "a woman, chirps, animal"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a dog is barking "], "question": "which entity has a woman speaking happily and an animal chirps?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vr8ZXjEBhMQ", "zl9Dqx-j7q4"], "start_seconds": ["150", "6"], "properties": ["wind, blow, zoom", "engine, laugh, loud"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage of a man driving a car in the dark"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a heavy rain falls endlessly", "paper folding and crinkling"], "sample_ids": ["wP8ZKrlx3oA", "zPpG3RD8lSs"], "start_seconds": ["40", "20"], "properties": ["heavy, rain, fall", "paper, fold, crinkle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day 
card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a heavy rain is falling on a surface", "the wind blows and a mouse clicks "], "question": "which entity is not a natural phenomenon", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a toilet flushes and a female speaks"], "sample_ids": ["x6ijhqRY38s", "yaln9y8I7ms"], "start_seconds": ["250", "230"], "properties": ["something metal, glass, hit", "female, flushes, toilet"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["people speak and tapping occurs", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tFCUUGdREgA", "uEU-Hg5MTN8"], "start_seconds": ["70", "27"], "properties": ["people, tap, speak", "a woman, laughs, animal"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a woman is speaking and a baby is 
crying"], "question": "which entity has a more snorting animal", "label": 1}, {"captions": ["someone whistles a song", "a clock ticktocks"], "sample_ids": ["sIXTftIuUgw", "v-g-j2uTByM"], "start_seconds": ["90", "30"], "properties": ["someone, song, whistle", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person whistling a song", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sYITalLZjj4", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["stream, flow, wind", "wind, blow, vehicle"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], "captions_pred_audio": ["wind blows and birds chirp", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a stream of water flowing nearby?", "label": 0}, {"captions": ["birds chirps while a siren signals in the distance", "paper is crumpling consistently"], "sample_ids": ["uKCSGgof8gI", "v5cSxLaHADY"], "start_seconds": ["12", "0"], "properties": ["chirps, distance, signal", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["birds fly and flutter around", "several insects fly while two men talk"], "sample_ids": ["wGKgwOP3h30", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["fly, flutter, around", "several, fly, men"], "captions_pred_video": ["of the pigeons in the coop", "a man climbing up a tree using a rope to reach the top of the tree"], 
"captions_pred_audio": ["pigeons coo and flap their wings", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of flying?", "label": 0}, {"captions": ["a large bell chimes back and forth loudly", "vehicles pass by on a roadway"], "sample_ids": ["w2M4i1mklOA", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "pass, vehicle, roadway"], "captions_pred_video": ["footage of an antique clock", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a man is filing a hard object"], "sample_ids": ["yI-KvObbDoY", "vveS8HT7Uog"], "start_seconds": ["260", "100"], "properties": ["sound, smack, wind", "a man, hard, object"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "footage is of a workbench with various tools on it including a hammer and a screwdriver"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a man is filing and speaking with background noise and breathing "], "question": "which entity is about a man filing a hard object?", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "an infant crying frantically"], "sample_ids": ["yswmmRZFItk", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["background, frog, croak", "cry, infant, frantically"], "captions_pred_video": ["a close up of a frog in the water", "of the baby crying in the car seat"], "captions_pred_audio": ["a frog is croaking", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["rwTERCUno", "w5W5Kqtc8E"], 
"start_seconds": ["90", "100"], "properties": ["engine, idle, sputter", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine is idling and vibrating", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a child babbles as a woman speaks"], "sample_ids": ["sfAvvZwdLCY", "wEBlkGWVWwE"], "start_seconds": ["20", "260"], "properties": ["water drains, flushes, water", "a, babble, woman"], "captions_pred_video": ["footage of the toilet in the bathroom", "shows a person writing on the whiteboard"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a child is speaking with background noise and clapping "], "question": "which entity is a human", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a car accelerates and wind blows"], "sample_ids": ["sfAvvZwdLCY", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["flushes, drains, water", "accelerates, wind, blows"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a duck quacks continuously"], "sample_ids": ["wyllXV6PjKo", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["a kid, talk, cry", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman speaks and a baby cries", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["smDKStoHBJo", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["a, 
talk, baby, cry", "water, radio, man"], "captions_pred_video": ["a man holding a crying baby in his arms", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a man speaks as a motor runs in the background"], "sample_ids": ["wIJK3-5y0kA", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["a, cry, baby", "background, motor, run"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a child speaks in closed space"], "sample_ids": ["y2ZBGpgbhHM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["animal, growl, bird", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vuUVPzd2FXw", "vfYTJq7nU"], "start_seconds": ["160", "130"], "properties": ["a, steam, release", "rustling, ducks, quack"], "captions_pred_video": ["of the person cooking on the grill with a spatula", null], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a duck 
quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "water pouring and bubbling"], "sample_ids": ["yeFvk9x0wWI", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["clack, bird, chirp", "water, bubbles, pouring"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "some men converse over an engine running"], "sample_ids": ["sShpyu2l4YQ", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["growl, bark, yip", "men, converse, engine"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uJV8NDaHqqk", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["loud, fly, 
chirp", "stream, water, flow"], "captions_pred_video": ["a bee hive in a wooden box", "footage is blurry and out of focus"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "an adult male speaks and dials a rotary phone"], "sample_ids": ["tOj4tdLRaA", "tK4VlLsNxak"], "start_seconds": ["70", "120"], "properties": ["woman, laugh, baby", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": [null, "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and using a sewing machine"], "question": "which entity is a person", "label": 1}, {"captions": ["water flows as men speak and yell", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vJ7JPEFhyLA", "zj2R0XoFr5k"], "start_seconds": ["16", "50"], "properties": ["water, flow, men", "airplane, boy, fly"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["spJCm8tD9Zo", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["snores, wheezes, sleeps", "airplane, boy, fly"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person is snoring loudly", "a 
woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a car accelerates and wind blows", "water pouring and bubbling"], "sample_ids": ["u0TrcHhkPQ", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["accelerates, wind, blows", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "someone whistles a tune"], "sample_ids": ["xyL9F5VrjkE", "sIXTftIuUgw"], "start_seconds": ["20", "90"], "properties": ["wind, motor, distance", "someone, tune, whistle"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["a duck quacks loudly and continuously", "some men converse over an engine running"], "sample_ids": ["vh30P49Po6s", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["loud, continuous, quacks", "men, converse, engine"], 
"captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["uJV8NDaHqqk", "uOpoD0gGXcs"], "start_seconds": ["100", "120"], "properties": ["loud, fly, chirp", "chirps, woman, bird"], "captions_pred_video": ["a bee hive in a wooden box", "a herd of cows grazing in the field"], "captions_pred_audio": ["a swarm of bees buzzing around", "birds are chirping and a man is speaking"], "question": "which entity is a response to a human chirping?", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "frogs croak and vocalize"], "sample_ids": ["w6RTHR6AeAg", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["call, owl, screech", "croak, vocalize, frog"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a frog is croaking"], "question": "which animal is more likely to be a predator", "label": 1}, {"captions": ["continuous sneezing together with speech", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["x4dZyf9Gbj0", "ukg5L09Wpvo"], "start_seconds": ["130", "150"], "properties": ["continuous, sneeze, speech", "clickety-clack, train, whistle"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman sneezes and speaks", "a train blows its whistle and blows its horn "], "question": "which entity is continuous", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "water flows as men speak and yell"], "sample_ids": 
["vuUVPzd2FXw", "vJ7JPEFhyLA"], "start_seconds": ["160", "16"], "properties": ["a, steam, release", "water, flow, men"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man talking while metallic objects are rapped and steam is released?", "label": 0}, {"captions": ["a baby cries and a woman speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["tMbMDvT50j8", "xBxDz0CFVn0"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "stream, water, flow"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["scraping and female speech with distant music", "an infant crying as a woman laughs"], "sample_ids": ["yHeVV-xeOxQ", "xhmRY9yhC7c"], "start_seconds": ["130", "20"], "properties": ["female, speech, music", "a, laugh, infant"], "captions_pred_video": ["of a girl milking a goat's udder", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "someone is typing on a computer keyboard"], "sample_ids": ["uEU-Hg5MTN8", "v0x1odnXtP0"], "start_seconds": ["27", "210"], "properties": ["animal, grunts, snorts", "keyboard, type, computer"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "how to make money on youtube in spanish"], "captions_pred_audio": 
["a woman is speaking and a baby is crying", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vlS6YMeWAPo", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["sheep, baa, birds", "a woman, something, fried"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a goat bleats and birds chirp", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["multiple ducks quack continuously", "goats bleat and people speak"], "sample_ids": ["wfHeoPDLMaM", "z5iUE5h0EPs"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "goats bleat, people speak, language"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "of the goat in the barn"], "captions_pred_audio": ["ducks are quacking", "a goat bleats and a man speaks"], "question": "which entity is speaking a 
language", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["y1saVTXsKwc", "tdWhHV3X25Q"], "start_seconds": ["80", "60"], "properties": ["a, dog, talk", "applause, audience, yells"], "captions_pred_video": ["a dog playing with a pink ball", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman sneezes then speaks", "an infant crying as a woman laughs"], "sample_ids": ["x4dZyf9Gbj0", "xhmRY9yhC7c"], "start_seconds": ["130", "20"], "properties": ["sneezes, speaks, woman", "a, laugh, infant"], "captions_pred_video": ["footage is blurry and out of focus", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman sneezes and speaks", "a baby cries and a woman speaks"], "question": "which woman is laughing", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["t69a8aRKhmc", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["a, b, c", "applause, audience, yells"], "captions_pred_video": ["footage is blurry and out of focus", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a person sniffs and sneezes"], "sample_ids": ["vqZuVbG6-HI", "uRlbY6aoBU"], "start_seconds": ["130", "0"], "properties": ["background, male, female", "sneezes, person, sniffs"], "captions_pred_video": ["footage is blurry because it's raining outside", 
null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is sneezing "], "question": "which entity is a person", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "an engine runs loudly"], "sample_ids": ["u--KhUW8l1Y", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["horn, siren, life", "loud, engine, run"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "plastic is tapped on while someone speaks"], "sample_ids": ["yYEVLuqEytU", "wvKpEYswXO0"], "start_seconds": ["40", "150"], "properties": ["grunt, slurp, background", "plastic, tap, speak"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of the person preparing food in the kitchen"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["spYNpeN7rPY", "wz7N8YRy74I"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "rooster, crow, background, men"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a clock in the background?", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "birds chirp and objects are moved around"], "sample_ids": ["yajyRTUQk3U", "yPUYU6t3rwo"], "start_seconds": ["400", "370"], "properties": ["a woman, something, fried", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "insects buzz and a man speaks"], "question": "which entity is about birds?", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a woman speaks happily and an animal chirps"], "sample_ids": ["ukg5L09Wpvo", 
"uWAAAL4CIoc"], "start_seconds": ["150", "0"], "properties": ["a train, a horn, a bell", "a woman, chirps, animal"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vBHyYJ8pL0", "yajyRTUQk3U"], "start_seconds": ["2", "400"], "properties": ["noise, door, opening", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["some men converse over an engine running", "water is sprayed across a hard surface"], "sample_ids": ["sCiy7QS1U", "sQwlkXjQabo"], "start_seconds": ["300", "10"], "properties": ["men, converse, engine", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a child babbles as a woman speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wEBlkGWVWwE", "vfYTJq7nU"], "start_seconds": ["260", "130"], "properties": ["a, babble, woman", "rustling, ducks, quack"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a duck quacks and a woman speaks"], 
"question": "which entity is about a child?", "label": 0}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "waves crash against a shoreline and people speak"], "sample_ids": ["t69a8aRKhmc", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["a, b, c", "wave, crash, shoreline"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "water is sprayed across a hard surface"], "sample_ids": ["y2ZBGpgbhHM", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["dog, chirp, breathe", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["birds chirping and a dog panting", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tDVADusiIoc", "uEU-Hg5MTN8"], "start_seconds": ["60", "27"], "properties": ["wind, radio, waves", "a woman, laughs, animal"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be in a boat", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zF8yoL0rkbI", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["engine, run, someone", 
"loud, jet engine, roar"], "captions_pred_video": ["footage of the traffic on the street at night", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a large crowd cheers and applauds"], "sample_ids": ["xl2PIWyXaM", "rqfQRErjfk8"], "start_seconds": ["160", "170"], "properties": ["chirp, man, younger person", "crowd, cheers, applauds"], "captions_pred_video": [null, "a man hugging another man in front of an orchestra"], "captions_pred_audio": ["birds are chirping and people are talking", "a crowd of people clapping and cheering"], "question": "which entity is more active", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zofjfKhqLk8", "tiDFTC-5vU"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "male, duck, laugh"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a child speaks in closed space"], "sample_ids": ["vfYTJq7nU", "yW6FWLSLkx4"], "start_seconds": ["130", "40"], "properties": ["rustling, ducks, quack", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", 
"label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a man speaks followed by another man speaking outside"], "sample_ids": ["xKB8O8LTs6s", "viuTg1M-dqg"], "start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "two men, speak, follow"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a woman speaking on a radio?", "label": 0}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["xl2PIWyXaM", "zFjIWfSD-4"], "start_seconds": ["160", "410"], "properties": ["chirp, man, younger person", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people are talking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "a car accelerates and wind blows"], "sample_ids": ["zcDwZ6W7E3E", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["a, man, speak", "accelerates, wind, blows"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which is not a person speaking", "label": 1}, {"captions": ["a clock ticktocks continuously", "a duck quacks continuously"], "sample_ids": ["vlJS7LN2XyM", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "quacks, continuously, duck"], 
"captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a ticktock of a clock", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vJrjSeP17yE", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["a person is sleeping, snoring, person", "a woman, something, fried"], "captions_pred_video": ["a black background with a small plane flying in the sky", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 0}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wnpJndXuxLc", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["beeps, loud, whistle", "water, radio, man"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uiItxDsDMFI", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "rooster, crow, background, men"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "footage of the sun shining through 
the clouds on a cloudy day"], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["s4Uz1Ffgo04", "xjhAnI2q6hM"], "start_seconds": ["100", "6"], "properties": ["roars, background, people speaking", "engine revs, vehicle, people"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a truck is revving its engine and a man is speaking "], "question": "which vehicle is revving", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xvDdE3zNf8Y", "su6FAOcOA8c"], "start_seconds": ["120", "4"], "properties": ["A, crumple, paper", "engine, idle, woman"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman is speaking and a subway train is moving "], "question": "which woman is speaking", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "small dogs yip and bark sharply"], "sample_ids": ["s4tUs779vBA", "v-wcQf4BDY0"], "start_seconds": ["160", "120"], "properties": ["a, sound, stop", "bark, yip, sharply"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a dog barks and growls"], "question": "which entity is more aggressive", 
"label": 1}, {"captions": ["an airplane engine runs", "wind blows as people chatter quietly"], "sample_ids": ["yVPZ2MNWpms", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "wind, chatter, people"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wqUmIEzuNz4", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["frog, bird, vocalize", "a woman, laughs, animal"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a cat meows and rustles", "a woman is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "several insects fly while two men talk"], "sample_ids": ["smDKStoHBJo", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["a, infant, speaking", "several, fly, men"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["wind blows and a stream of water flows nearby", "a car speeding up in the distance"], "sample_ids": ["sYITalLZjj4", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["stream, flow, wind", "distance, car, speed"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], 
"captions_pred_audio": ["wind blows and birds chirp", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "a propeller rotates loudly and intensely"], "sample_ids": ["wqZ135Ssz0", "ugHJF0hfYkg"], "start_seconds": ["60", "10"], "properties": ["two men, woman, birds", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a helicopter is flying overhead "], "question": "which entity is more quiet", "label": 0}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a train engine runs and a horn blows"], "sample_ids": ["zY3icUyMdh8", "zPX9o1uDiI"], "start_seconds": ["20", "40"], "properties": ["dog, bark, engine", "engine, horn, run"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is a train", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w9lpbUn0hPc", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["male, wind, rustling", "stream, water, flow"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "pigeons 
vocalize and birds chirp"], "sample_ids": ["vr8ZXjEBhMQ", "uiS58TNyUiw"], "start_seconds": ["150", "430"], "properties": ["wind, blow, zoom", "vocalize, bird, chirp"], "captions_pred_video": ["is taken from a motorcycle's point of view", "of the pigeon in the cage"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["an aircraft engine runs", "roadway noise occurs and a truck accelerates"], "sample_ids": ["yLCORCnd35Q", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["engine, aircraft, runs", "noise, truck, accelerate"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a car is driving on the road "], "question": "which is not a source of noise", "label": 0}, {"captions": ["an airplane engine runs", "a man talks as several small engines run"], "sample_ids": ["yVPZ2MNWpms", "u9A6VZQCZpU"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "a, man, talk"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking while a race car is revving and accelerating "], "question": "which entity is a person", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "someone is typing on a computer keyboard"], "sample_ids": ["vzceMbklWc", "v0x1odnXtP0"], "start_seconds": ["180", "210"], "properties": ["water, faucet, sink", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["water is running and a man is speaking", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard?", "label": 1}, 
{"captions": ["a vehicle engine runs and wind blows before women yell", "a man speaks as a car is passing by"], "sample_ids": ["w5W5Kqtc8E", "sK4u5T8hW78"], "start_seconds": ["100", "30"], "properties": ["wind, blow, vehicle", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "water flows as men speak and yell"], "sample_ids": ["zhx6hoYrHeI", "vJ7JPEFhyLA"], "start_seconds": ["160", "16"], "properties": ["engine, sputter, rough", "water, flow, men"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and yelling?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "paper is crumpling consistently"], "sample_ids": ["w2M4i1mklOA", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["alarm, gears, turn", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of an antique clock", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "paper is crumpled and crinkled"], "question": "which entity is 
crumpling", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "paper folding and crinkling"], "sample_ids": ["yDoT73BWsdA", "zPpG3RD8lSs"], "start_seconds": ["10", "20"], "properties": ["engine revs, tires squeal, vehicle", "paper, fold, crinkle"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["xBxDz0CFVn0", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["wind, chatter, people", "background, birds, rustling"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds vocalize and a man speaks", "plastic is tapped on while someone speaks"], 
"sample_ids": ["v0wPrLBI3hg", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["vocalize, bird, speak", "plastic, tap, speak"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "of the person preparing food in the kitchen"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "male speech followed by light wind, rustling, distant speech and brief hissing"], "sample_ids": ["tDlfY3nmx1A", "w9lpbUn0hPc"], "start_seconds": ["160", "30"], "properties": ["applause, laugh, man", "male, wind, rustling"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of a man in a black shirt standing in front of a white truck in a parking lot"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man is speaking with wind noise and breathing sounds in the background "], "question": "which entity is a speech?", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a horn rings out as a machine runs by"], "sample_ids": ["w5W5Kqtc8E", "slZLHwNbbt4"], "start_seconds": ["100", "300"], "properties": ["wind, engine, scream", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "water flows as men speak and yell"], "sample_ids": ["ylpYOorfH4o", "vJ7JPEFhyLA"], "start_seconds": ["410", "16"], "properties": ["motor, run, steady", "water, flow, 
men"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yajyRTUQk3U", "tDVADusiIoc"], "start_seconds": ["400", "60"], "properties": ["a woman, something, fried", "water, radio, man"], "captions_pred_video": ["- a woman cooking in the kitchen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["a car speeding up in the distance", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["u0TrcHhkPQ", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["distance, car, speed", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and 
a baby is crying"], "question": "which entity is not a car?", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wnpJndXuxLc", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["beeps, loud, whistle", "three men, wind, flow"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "a clock ticktocks"], "sample_ids": ["s4Uz1Ffgo04", "v-g-j2uTByM"], "start_seconds": ["100", "30"], "properties": ["water, rushes, vehicle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "an engine runs loudly"], "sample_ids": ["vlS6YMeWAPo", "vqZuVbG6-HI"], "start_seconds": ["40", "130"], "properties": ["noise, bleat, call", "loud, engine, run"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a goat bleats and birds chirp", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "small dogs yip and bark sharply"], "sample_ids": ["viuTg1M-dqg", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["two men, 
speak, follow", "bark, yip, sharply"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["continuous sneezing together with speech", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["x4dZyf9Gbj0", "tDVADusiIoc"], "start_seconds": ["130", "60"], "properties": ["continuous, sneeze, speech", "water, radio, man"], "captions_pred_video": ["footage is blurry and out of focus", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sofxkNWaP0s", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "rooster, crow, background, men"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a clock ticktocks"], "sample_ids": ["zofjfKhqLk8", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a man using a 
machine to cut a piece of wood", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "wind blows as people chatter quietly"], "sample_ids": ["vzxHnu-SFEw", "xBxDz0CFVn0"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "wind, chatter, people"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["sEprKHm8Sj8", "siJFXfGWgDk"], "start_seconds": ["90", "50"], "properties": ["car, tires, slows", "a, bird, vehicle"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 
2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and birds are chirping in the background "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "a toilet flushes and a female speaks"], "sample_ids": ["vbpKkWvfOu4", "yaln9y8I7ms"], "start_seconds": ["560", "230"], "properties": ["a, woman, man", "female, flushes, toilet"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a toilet flushes and a man speaks"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a race car approaches quickly and slows down squealing tires", "someone is typing on a computer keyboard"], "sample_ids": ["sEprKHm8Sj8", "v0x1odnXtP0"], "start_seconds": ["90", "210"], "properties": ["car, tires, slows", "keyboard, type, computer"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "how to make money on youtube in spanish"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which object is stationary", "label": 1}, {"captions": ["repeated tapping is accompanied by 
water running and a woman speaking softly", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wvKpEYswXO0", "uEU-Hg5MTN8"], "start_seconds": ["150", "27"], "properties": ["sound, water, running", "a woman, laughs, animal"], "captions_pred_video": ["of the person preparing food in the kitchen", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking softly?", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a person snores loudly multiple times at a close distance"], "sample_ids": ["zkKdxzNC97Y", "sSMl2vc3ek"], "start_seconds": ["27", "20"], "properties": ["loud, bang, noise", "loud, multiple, distance"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a person snoring loudly"], "question": "which entity is louder", "label": 0}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "people cheer as a vehicle engine revs"], "sample_ids": ["xNMovAf3o50", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["rain, thunder, music", "engine revs, vehicle, people"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["vveS8HT7Uog", "xjvTpk2Zpr8"], "start_seconds": ["100", "70"], "properties": ["a man, objects, speak", "wind, blows, vehicle"], 
"captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a loud engine muffles a man as he speaks"], "sample_ids": ["v7jJS8aAyA", "xyx6eNVEYRY"], "start_seconds": ["10", "380"], "properties": ["wind, blows, loudly", "loud, engine, muffles"], "captions_pred_video": [null, "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a dark barks and whimpers", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["sYj4hpDUZDQ", "sapQIQUhFc"], "start_seconds": ["30", "280"], "properties": ["barks, whimpers, dark", "liquid, flow, distance"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "an insect buzzes around continuously"], "sample_ids": ["w1mlz3Pe4fU", "v25l1jef3JY"], "start_seconds": ["300", "0"], "properties": ["vocalize, chirp, continuously", "buzzes, continuously, insect"], "captions_pred_video": ["of a bird in a cage", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["birds are chirping and singing", "a fly is buzzing around a microphone "], "question": "which entity is a type of insect", "label": 1}, {"captions": ["a clock ticktocks and sounds an 
alarm then a man laughs", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xV7Mg1QucSc", "ukg5L09Wpvo"], "start_seconds": ["14", "150"], "properties": ["alarm, ticktocks, laughs", "clickety-clack, train, whistle"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vqZuVbG6-HI", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["background, male, female", "a woman, laughs, animal"], "captions_pred_video": ["footage is blurry because it's raining outside", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal snorting?", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zO-LSSY92ZM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["liquid, surface, sound", "stream, water, flow"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air 
filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "footage is blurry and out of focus"], "captions_pred_audio": ["steam is hissing and hissing", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sLUnaPT5gM8", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["loud, laughter, intermittent", "loud, multiple, distance"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an airplane engine spools and people speak", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wTjoRj1se3U", "vb1fPSDI4c"], "start_seconds": ["390", "30"], "properties": ["airplane, engine, spool", "multiple, people, yell"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "winds blows roughly as a vehicle races past"], "sample_ids": ["sG7TyPnFDR0", "xjvTpk2Zpr8"], "start_seconds": ["180", "70"], "properties": ["beeps, machine, smoke alarm", "wind, blows, 
vehicle"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xfaoyyzw2WU", "yajyRTUQk3U"], "start_seconds": ["180", "400"], "properties": ["loud, jet engine, roar", "a woman, something, fried"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "- a woman cooking in the kitchen"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "sucking and grunting followed by slurping with birds in the background"], "sample_ids": ["wnpJndXuxLc", "yYEVLuqEytU"], "start_seconds": ["50", "40"], "properties": ["blows, vehicle, train", "grunt, slurp, background"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a baby goat is being petted by a person's hand"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "several sheep bleat and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vZAw4apG0Es", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "rooster, crow, background, men"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of the sun shining through the clouds on a cloudy day"], 
"captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a mechanical buzzing getting louder"], "sample_ids": ["tDlysoZiA1I", "sEprKHm8Sj8"], "start_seconds": ["0", "90"], "properties": ["animal, grunt, chirp", "noise, loud, buzzing"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["ticking continues without interruption", "a horn rings out as a machine runs by"], "sample_ids": ["v-g-j2uTByM", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["ticking, continuous, clock", "a, horn, run"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a clock is ticking loudly", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is not continuous", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["siJFXfGWgDk", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["a, bird, vehicle", "water, radio, man"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and birds are chirping in the 
background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "ducks quack continuously"], "sample_ids": ["vbr9mHKc8WM", "sNB8zxXneIM"], "start_seconds": ["40", "20"], "properties": ["noise, loudness, engine", "quack, duck, continuously"], "captions_pred_video": [null, "a group of geese in a cage"], "captions_pred_audio": ["an engine is idling", "a rooster is crowing and wind is blowing "], "question": "which entity makes noise continuously", "label": 1}, {"captions": ["an electronic device bleeps once", "a man speaks as a car is passing by"], "sample_ids": ["tHJ6JSa8Y4", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["bleeps, electronic, device", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a clock is ticking and beeping", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "some men converse over an engine running"], "sample_ids": ["sWZzXuWYY", "sCiy7QS1U"], "start_seconds": ["420", "300"], "properties": ["male, speech, banging", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a clock ticktocks briefly", "an 
insect buzzes around continuously"], "sample_ids": ["u7C-AEBQM", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, ticktocks briefly", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a ticktock of a clock", "a fly is buzzing around a microphone "], "question": "which entity buzzes continuously", "label": 1}, {"captions": ["loud, continuous burping", "birds chirp in the background while a horse neighs followed by a girl speaking"], "sample_ids": ["y636gklDioE", "s59PfAghdkM"], "start_seconds": ["20", "0"], "properties": ["loud, continuous, burping", "bird, chirp, background, horse, neigh"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "is an anime scene featuring two people and a horse in the foreground and a fence in the background"], "captions_pred_audio": ["a person burps loudly several times", "birds are chirping a horse is neighing and a woman is speaking "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "some men converse over an engine running"], "sample_ids": ["wz7N8YRy74I", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["rooster, crow, background, people", "men, converse, engine"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a rooster in it?", "label": 0}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "frogs croak and vocalize"], "sample_ids": ["wz7N8YRy74I", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["rooster, crow, background, people", "croak, vocalize, frog"], 
"captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a close up of a frog in the water"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a frog is croaking"], "question": "which animal is speaking", "label": 1}, {"captions": ["paper is crumpling consistently", "winds blows roughly as a vehicle races past"], "sample_ids": ["v5cSxLaHADY", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "wind, blows, vehicle"], "captions_pred_video": ["footage of the person holding a pair of scissors", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["paper is crumpled and crinkled", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["plastic is tapped on while someone speaks", "some tunes played by whistling"], "sample_ids": ["wvKpEYswXO0", "u6BnG6YZqJ4"], "start_seconds": ["150", "0"], "properties": ["plastic, tap, speak", "tune, play, whistling"], "captions_pred_video": ["of the person preparing food in the kitchen", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["an engine runs and wind blows", "people speak as gunfire rings out"], "sample_ids": ["vs65y4qmyBE", "wqTCwqVRDlk"], "start_seconds": ["340", "80"], "properties": ["engine, run, wind", "gunfire, ring, speak"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["a clock ticks quietly and 
rhythmically", "children cheer as a man speaks then an audience screams"], "sample_ids": ["u7C-AEBQM", "vJvryTwuAV8"], "start_seconds": ["30", "16"], "properties": ["ticks, rhythmic, quiet", "audience, cheer, man"], "captions_pred_video": [null, "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "vehicles pass by on a roadway"], "sample_ids": ["y2ZBGpgbhHM", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["dog, chirp, breathe", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds chirping and a dog panting", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaking with light rustling", "a motor idles, accelerates, then slows down."], "sample_ids": ["zOZleIRqZm4", "vYkA3cfXp5Q"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "speed, idle, accelerate"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wP8ZKrlx3oA", "yDoT73BWsdA"], "start_seconds": ["40", "10"], "properties": ["fall, rain, splash", "engine, revs, vehicle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a race 
car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vbZ-0lGPneg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a woman, a television program, a bird", "female, spraying, scream"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking and a baby is crying"], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["a fly buzzes around loudly as birds chirp", "a vehicle engine accelerating then running on idle"], "sample_ids": ["uJV8NDaHqqk", "vYkA3cfXp5Q"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "engine, accelerate, idle"], "captions_pred_video": ["a bee hive in a wooden box", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a swarm of bees buzzing around", "an engine is idling"], "question": "which entity is not a fly?", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a child speaks in closed space"], "sample_ids": ["sEprKHm8Sj8", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["car, tires, slows", "child, space, speak"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking with background noise and breathing sounds "], "question": 
"which entity is speaking", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "wind blows strongly"], "sample_ids": ["w5W5Kqtc8E", "w8uLijTqtlU"], "start_seconds": ["100", "70"], "properties": ["water, splashes, motorboat", "wind, blows, strongly"], "captions_pred_video": [null, "footage is blurry and shaky"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "the wind is blowing strongly"], "question": "which entity is more likely to blow strongly", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["yVumC9TGknc", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["humming, clock, birds", "man, woman, squawks"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", null], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["an engine sputters followed by a car zooming by", "a vehicle engine accelerating then running on idle"], "sample_ids": ["u5RmF3c3Aw", "vYkA3cfXp5Q"], "start_seconds": ["60", "30"], "properties": ["engine, car, zoom", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "material crumbles into a microphone"], "sample_ids": ["sG7TyPnFDR0", "vofpvUo6NAw"], "start_seconds": ["180", "220"], "properties": ["beeps, machine, smoke alarm", "material, crumbles, microphone"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", 
"person wrapping a toy car in a plastic bag"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "paper is being crumpled and crinkled"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "paper is crumpling consistently"], "sample_ids": ["wTideSjRFS0", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["food, sizzle, woman", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "water is sprayed across a hard surface"], "sample_ids": ["vZAw4apG0Es", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["people, clock, converse", "water, spray, surface"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a clock is ticking and people are talking", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a woman speaks as she rubs two objects together"], "sample_ids": ["v7jJS8aAyA", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["wind, blows, loudly", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is silent", "label": 1}, {"captions": ["a woman and man are speaking", "a horn rings out as a machine runs by"], "sample_ids": ["vbpKkWvfOu4", "slZLHwNbbt4"], "start_seconds": ["560", "300"], "properties": ["two people, speaking, woman, man", "a, horn, run"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["spJCm8tD9Zo", "uEU-Hg5MTN8"], "start_seconds": ["90", "27"], "properties": ["snores, wheezes, sleeps", "a woman, laughs, animal"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a 
baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["a power tool runs and touches a surface", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zfvPRf3chY", "tdWhHV3X25Q"], "start_seconds": ["290", "60"], "properties": ["power tool, run, touch", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a man is speaking and a crowd is clapping"], "question": "which is not a power tool", "label": 1}, {"captions": ["someone whistles a tune", "a person speaks over rustling leaves"], "sample_ids": ["sIXTftIuUgw", "zOZleIRqZm4"], "start_seconds": ["90", "80"], "properties": ["someone, tune, whistle", "rustling, leaves, person"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with crickets chirping in the background"], "question": "which entity is a person speaking over rustling leaves?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a duck quacks continuously"], "sample_ids": ["wz7N8YRy74I", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "quacks, continuously, duck"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["a woman sneezes then speaks", "water is sprayed across a hard surface"], "sample_ids": ["x4dZyf9Gbj0", "sQwlkXjQabo"], "start_seconds": ["130", "10"], "properties": ["sneezes, speaks, woman", "water, spray, surface"], "captions_pred_video": 
["footage is blurry and out of focus", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman sneezes and speaks", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vlS6YMeWAPo", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["noise, bleat, call", "three men, wind, flow"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a goat bleats and birds chirp", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a goat?", "label": 0}, {"captions": ["a vehicle engine revs as the vehicle passes", "a child speaks in closed space"], "sample_ids": ["yDoT73BWsdA", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["engine, revs, vehicle", "child, space, speak"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["someone whistles briefly", "paper is crumpling consistently"], "sample_ids": ["uFoga8sHpiw", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["sound, duration, pitch", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a bird in a cage", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person whistles a song", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a frog croaks as other 
frogs croak in the background"], "sample_ids": ["zgUgkpk78xU", "yswmmRZFItk"], "start_seconds": ["70", "0"], "properties": ["horn, bells, ring", "background, frog, croak"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a close up of a frog in the water"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a frog is croaking"], "question": "which entity is a warning", "label": 0}, {"captions": ["a man talks while vehicles pass by", "a woman speaks happily and an animal chirps"], "sample_ids": ["sK4u5T8hW78", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["a, man, talk", "a woman, chirps, animal"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a bell dings followed by a loud horn blaring"], "sample_ids": ["zl9Dqx-j7q4", "tZGN5a7ybxo"], "start_seconds": ["6", "60"], "properties": ["motors rev, laugh, loudly", "a bell, a horn, a ding"], "captions_pred_video": ["footage of a man driving a car in the dark", "is taken from a moving vehicle on the train tracks"], "captions_pred_audio": ["a jet 
engine roars ", "a train is moving and blowing its horn "], "question": "which entity is louder", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sNB8zxXneIM", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["several, quack, cocks", "men, talk, cars"], "captions_pred_video": ["a group of geese in a cage", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream of water runs briefly", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["x-PeY8Yb8M4", "zl9Dqx-j7q4"], "start_seconds": ["300", "6"], "properties": ["stream, water, run", "engine, laugh, loud"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a car is driving on a wet road ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["ukg5L09Wpvo", "yks4cLgIDMc"], "start_seconds": ["150", "170"], "properties": ["clickety-clack, train, whistle", 
"background, speaking, child"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["v5P-ThUCINM", "uWAAAL4CIoc"], "start_seconds": ["400", "0"], "properties": ["background, chirp, bird", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and birds are chirping", "a woman is speaking and a dog is barking "], "question": "which entity has a chirpy animal?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wwyfGO2J4", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["people, applaud, hoot", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a toilet flushes and water drains"], "sample_ids": ["vSeGhaZt-aI", "sfAvvZwdLCY"], "start_seconds": ["50", "20"], "properties": ["water, sink, talk", "water drains, flushes, water"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a toilet is flushed"], "question": "which entity is draining water", "label": 1}, 
{"captions": ["a stream of water flows as people talk and wind blows", "an infant crying as a woman laughs"], "sample_ids": ["xBxDz0CFVn0", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["stream, water, flow", "a, laugh, infant"], "captions_pred_video": ["footage is blurry and out of focus", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "wind blowing followed by a zoom"], "sample_ids": ["vs65y4qmyBE", "vr8ZXjEBhMQ"], "start_seconds": ["340", "150"], "properties": ["engine, run, man", "wind, blow, zoom"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a dog barks and whimpers", "pigeons vocalize and birds chirp"], "sample_ids": ["sShpyu2l4YQ", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["barks, whimpers, dog", "vocalize, bird, chirp"], "captions_pred_video": ["the puppies are playing with a toy", "of the pigeon in the cage"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "an infant crying as a woman laughs"], "sample_ids": ["zhx6hoYrHeI", "xhmRY9yhC7c"], "start_seconds": ["160", "20"], "properties": ["engine, sputter, rough", "a, laugh, infant"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby cries and a woman speaks"], "question": "which entity is a 
person", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a clock ticktocks"], "sample_ids": ["vzceMbklWc", "v-g-j2uTByM"], "start_seconds": ["180", "30"], "properties": ["water, faucet, sink", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["water is running and a man is speaking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sShpyu2l4YQ", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["growl, bark, yip", "loud, multiple, distance"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["people speak softly as food sizzles", "people cheer as a vehicle engine revs"], "sample_ids": ["yhQ2Lg-7qDY", "xjhAnI2q6hM"], "start_seconds": ["130", "6"], "properties": ["food, sizzle, speak", "engine revs, vehicle, people"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "water flows and trickles"], "sample_ids": ["tDVADusiIoc", "tB7hWb9gTuQ"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "water, flow, trickle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "the rocks on the beach are surrounded by water and the sky is visible in the background"], 
"captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["uKCSGgof8gI", "ukg5L09Wpvo"], "start_seconds": ["12", "150"], "properties": ["chirps, distance, signal", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["food is frying while a woman speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yhQ2Lg-7qDY", "zFjIWfSD-4"], "start_seconds": ["130", "410"], "properties": ["food, woman, speak", "People, motor, brakes"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a woman sneezes then speaks", "an infant crying frantically"], "sample_ids": ["x4dZyf9Gbj0", "zwOBqeFTgiU"], "start_seconds": ["130", "30"], "properties": ["sneezes, speaks, woman", "cry, infant, frantically"], "captions_pred_video": ["footage is blurry and out of focus", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman sneezes and speaks", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a man speaks followed by another man speaking outside"], "sample_ids": ["w5W5Kqtc8E", 
"viuTg1M-dqg"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a woman speaks happily and an animal chirps"], "sample_ids": ["vZAqdHZ81yA", "uWAAAL4CIoc"], "start_seconds": ["180", "0"], "properties": ["engine, motorcycle, idling", "a woman, chirps, animal"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a woman is speaking and a dog is barking "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "continuous snoring"], "sample_ids": ["tOSWIURC-4", "sLkeqCDJIyw"], "start_seconds": ["0", "120"], "properties": ["engine, work, nearby", "loud, snoring, noise"], "captions_pred_video": [null, ", what is the man doing on the couch? 
sleeping"], "captions_pred_audio": ["a lawn mower is running ", "a person is snoring loudly"], "question": "which entity makes a loud noise", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "several insects fly while two men talk"], "sample_ids": ["u21-Z5gJCB8", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["background, voice, man", "several, fly, men"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a helicopter engine runs continuously", "an engine runs loudly"], "sample_ids": ["ugHJF0hfYkg", "vqZuVbG6-HI"], "start_seconds": ["10", "130"], "properties": ["engine, running, continuously", "loud, engine, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a helicopter is flying overhead ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a car speeding up in the distance"], "sample_ids": ["xZepNM9qcRA", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, motor, run", "distance, car, speed"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["several ducks quack and cocks crow far away", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sNB8zxXneIM", "w5W5Kqtc8E"], "start_seconds": ["20", 
"100"], "properties": ["several, quack, cocks", "wind, blow, vehicle"], "captions_pred_video": ["a group of geese in a cage", null], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "winds blows roughly as a vehicle races past"], "sample_ids": ["smGI3C1NZc", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["water, drain, toilet", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a toilet is flushed", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person is whistling", "a helicopter engine runs continuously"], "sample_ids": ["sIXTftIuUgw", "ugHJF0hfYkg"], "start_seconds": ["90", "10"], "properties": ["person, whistling, person", "engine, running, continuously"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person whistling a song", "a helicopter is flying overhead "], "question": "which entity is not whistling", "label": 1}, {"captions": ["people clap and speak in the distance", "a man speaks as a motor runs in the background"], "sample_ids": ["wwyfGO2J4", "xZepNM9qcRA"], "start_seconds": ["90", "30"], "properties": ["clap, distance, speak", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "some liquid flows while a woman laughs and man talks"], 
"sample_ids": ["uKCSGgof8gI", "vddP56-ogds"], "start_seconds": ["12", "30"], "properties": ["chirps, distance, signal", "liquid, laughs, man"], "captions_pred_video": ["footage of a street in a small town on a sunny day", null], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "water is running and gurgling and a man is speaking"], "question": "which entity is more likely to be a video of a woman laughing?", "label": 1}, {"captions": ["water flows as men speak and yell", "some tunes played by whistling"], "sample_ids": ["vJ7JPEFhyLA", "u6BnG6YZqJ4"], "start_seconds": ["16", "0"], "properties": ["water, flow, men", "tune, play, whistling"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a dark barks and whimpers", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sYj4hpDUZDQ", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["barks, whimpers, dark", "airplane, boy, fly"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a dog barks and a cat meows", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a person is burping then speaks and laughs"], "sample_ids": ["wEBlkGWVWwE", "wAAkbZToh8"], "start_seconds": ["260", "0"], "properties": ["a, babble, woman", "burp, laugh, speak"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking 
with background noise and clapping ", "a man burps and a woman speaks"], "question": "which entity is speaking", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a woman speaks as she rubs two objects together"], "sample_ids": ["wSVhSdj0F0", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["beep, clang, footsteps", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["tgbONvsP47Y", "vddP56-ogds"], "start_seconds": ["0", "30"], "properties": ["noise, truck, accelerate", "liquid, laughs, man"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "water is running and gurgling and a man 
is speaking"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a stream runs then someone speaks", "a child speaks in closed space"], "sample_ids": ["wbHTKEJZyhc", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["stream, run, someone", "child, space, speak"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y2bVZ7rz-5M", "wz7N8YRy74I"], "start_seconds": ["280", "30"], "properties": ["engine, horn, 
siren", "rooster, crow, background, men"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "water flows as men speak and yell"], "sample_ids": ["ukg5L09Wpvo", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["a train, a horn, a bell", "water, flow, men"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be in motion", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["vbpKkWvfOu4", "wwyfGO2J4"], "start_seconds": ["560", "90"], "properties": ["a, man, speaks", "people, applaud, hoot"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a horn rings out as a machine runs by"], "sample_ids": ["vqZuVbG6-HI", "slZLHwNbbt4"], "start_seconds": ["130", "300"], "properties": ["background, male, female", "a, horn, run"], 
"captions_pred_video": ["footage is blurry because it's raining outside", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity has a horn", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a man talks while metallic objects are rapped and steam is released"], "sample_ids": ["vf9xf3vMsGM", "vuUVPzd2FXw"], "start_seconds": ["540", "160"], "properties": ["A man speaks while turning a water faucet on.", "a, steam, release"], "captions_pred_video": ["of the person washing their hands under the faucet", "of the person cooking on the grill with a spatula"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man is speaking and dishes are clanging"], "question": "which entity is about water?", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "a child speaks in closed space"], "sample_ids": ["yRx9txMcBl0", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["accelerates, tires, squeals", "child, space, speak"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking with background noise and breathing sounds 
"], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a music is played followed by a frog croaking and then music is played again"], "sample_ids": ["vh30P49Po6s", "voJh2gJxXhA"], "start_seconds": ["30", "50"], "properties": ["loud, continuous, quacks", "music, frog, croak"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a frog on a black background with a red diamond in the center"], "captions_pred_audio": ["a duck is quacking loudly", "music is playing and crickets are chirping "], "question": "which entity is quieter", "label": 1}, {"captions": ["a door opens and birds chirp", "a car accelerates and wind blows"], "sample_ids": ["yeFvk9x0wWI", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["door, open, birds", "accelerates, wind, blows"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["u21-Z5gJCB8", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["background, voice, man", "loud, laughter, intermittent"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a small engine idles continuously"], "sample_ids": ["ukxt9I7eMMg", "y5WII6cTH7k"], "start_seconds": ["30", "40"], "properties": ["continuous, woman, speaking", 
"engine, idle, continuously"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "an engine is knocking and vibrating "], "question": "which entity is a video of a woman speaking?", "label": 0}, {"captions": ["water running down a sink while a man is talking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vSeGhaZt-aI", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "stream, water, flow"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is flowing", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a car accelerates and wind blows"], "sample_ids": ["vXlk0lIQBFo", "u0TrcHhkPQ"], "start_seconds": ["470", "20"], "properties": ["wind, speak, vocalize", "accelerates, wind, blows"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person sniffles and sneezes", "a man speaks as a motor runs in the background"], "sample_ids": ["uRlbY6aoBU", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is sneezing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, 
{"captions": ["a man talks as something metal hits against and glass is set down", "water runs into a sink while men speak"], "sample_ids": ["x6ijhqRY38s", "vzceMbklWc"], "start_seconds": ["250", "180"], "properties": ["something metal, glass, hit", "water, sink, run"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "water is running and a man is speaking"], "question": "which entity is a video of water running into a sink?", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a child speaks in closed space"], "sample_ids": ["voJh2gJxXhA", "yW6FWLSLkx4"], "start_seconds": ["50", "40"], "properties": ["music, frog, croak", "child, space, speak"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a machine beeps continuously", "a car accelerates and wind blows"], "sample_ids": ["y682ml90jGw", "u0TrcHhkPQ"], "start_seconds": ["11", "20"], "properties": ["beeps, machine, continuously", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a train engine runs and a horn blows"], "sample_ids": ["su6FAOcOA8c", "zPX9o1uDiI"], "start_seconds": ["4", "40"], "properties": ["engine, run, woman", "engine, horn, run"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": 
["a woman is speaking and a subway train is moving ", "a train moves with its horn blowing and wheels squealing "], "question": "which entity has a horn", "label": 1}, {"captions": ["birds chirp as a train approaches", "an infant crying frantically"], "sample_ids": ["xM4joTqDVp4", "zwOBqeFTgiU"], "start_seconds": ["160", "30"], "properties": ["bird, chirp, train", "cry, infant, frantically"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "of the baby crying in the car seat"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a horn honks and then loudly blares", "waves crash against a shoreline and people speak"], "sample_ids": ["wnpJndXuxLc", "yFB25fqfU8I"], "start_seconds": ["50", "300"], "properties": ["horn, honk, loud", "wave, crash, shoreline"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["running water in a faucet with some clinks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zNRChLjqcU", "vfYTJq7nU"], "start_seconds": ["220", "130"], "properties": ["water, faucet, run", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["people speak in a closed space", "a man speaks as a car is passing by"], "sample_ids": ["sTpirNYo8vQ", "sK4u5T8hW78"], 
"start_seconds": ["30", "30"], "properties": ["people, space, speak", "a, car, pass"], "captions_pred_video": ["of a man taking a selfie on a bus", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more likely to be in a closed space", "label": 0}, {"captions": ["dogs barking and whimpering", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tIY7qOV3rEM", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["barking, whimpering, dog", "loud, jet engine, roar"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uKCSGgof8gI", "vbZ-0lGPneg"], "start_seconds": ["12", "30"], "properties": ["chirps, distance, signal", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": 
["someone is typing on a computer keyboard", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["v0x1odnXtP0", "yajyRTUQk3U"], "start_seconds": ["210", "400"], "properties": ["keyboard, type, computer", "a woman, something, fried"], "captions_pred_video": ["how to make money on youtube in spanish", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking while food is frying in the background"], "question": "which entity is a demonstration of cooking?", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "people cheer as a vehicle engine revs"], "sample_ids": ["vcmWSmvti8", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["music, man, fire", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zofjfKhqLk8", "tiDFTC-5vU"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "male, duck, laugh"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "race cars go around a track as a man commentates"], "sample_ids": ["wz7N8YRy74I", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["rooster, crow, background, 
people", "car, track, man"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["several beeps are followed by a hit and a woman talking", "paper folding and crinkling"], "sample_ids": ["w34HjHr6gAY", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["beeps, hit, woman", "paper, fold, crinkle"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube 
how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling paper?", "label": 1}, {"captions": ["a person sniffs and sneezes", "waves crash against a shoreline and people speak"], "sample_ids": ["uRlbY6aoBU", "yFB25fqfU8I"], "start_seconds": ["0", "300"], "properties": ["sneezes, person, sniffs", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "small dogs yip and bark sharply"], "sample_ids": ["uZesmtKZGSw", "v-wcQf4BDY0"], "start_seconds": ["250", "120"], "properties": ["men, talk, cars", "bark, yip, sharply"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a infant makes noise and is excited"], "sample_ids": ["sZvwOuuPGP0", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "noise, excited, infant"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a medium engine is running ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a woman sneezes then speaks", "small dogs yip and bark sharply"], "sample_ids": ["x4dZyf9Gbj0", "v-wcQf4BDY0"], "start_seconds": ["130", "120"], "properties": ["sneezes, speaks, woman", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman sneezes and speaks", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman talking as an infant is crying", "a clock ticktocks"], "sample_ids": ["tMbMDvT50j8", "v-g-j2uTByM"], "start_seconds": ["12", "30"], "properties": ["a, talk, infant", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a baby cries and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a bus engine idles while a woman speaks making an announcement"], 
"sample_ids": ["ugHJF0hfYkg", "su6FAOcOA8c"], "start_seconds": ["10", "4"], "properties": ["loud, intense, propeller", "engine, idle, woman"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a subway train is moving "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["yajyRTUQk3U", "y2bVZ7rz-5M"], "start_seconds": ["400", "280"], "properties": ["noise, woman, speak", "motor noise, horn, siren"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a truck is honking its horn and a siren is blaring "], "question": "which noise is more ominous", "label": 1}, {"captions": ["an engine runs and a man speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yT5WfYMRr-U", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "multiple, people, yell"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "wind blows as people chatter quietly"], "sample_ids": ["xMXvkIcaG0Y", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["sound, humming, rattling", "wind, chatter, people"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "footage is blurry and out of focus"], "captions_pred_audio": ["an engine 
is revving and accelerating ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a duck quacks continuously", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vh30P49Po6s", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["quacks, continuously, duck", "stream, water, flow"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage is blurry and out of focus"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wSVhSdj0F0", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["horn honks, keys jingle, slam", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tK4VlLsNxak", "sSMl2vc3ek"], "start_seconds": ["120", "20"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "loud, multiple, distance"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sfAvvZwdLCY", 
"wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "rooster, crow, background, men"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["zFjIWfSD-4", "tezvROoo4bs"], "start_seconds": ["410", "40"], "properties": ["People, motor, brakes", "audio, throttle, speaking"], "captions_pred_video": [null, "footage of a busy city street with cars parked on both sides of the road"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a car accelerates and revs while a man speaks "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a frog croaks as other frogs croak in the background"], "sample_ids": ["yYEVLuqEytU", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["animal, pig, background", "background, frog, croak"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a close up of a frog in the water"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a frog is croaking"], "question": "which entity has a frog in the background?", "label": 1}, {"captions": ["continuous snoring", "paper folding and crinkling"], "sample_ids": ["sLkeqCDJIyw", "zPpG3RD8lSs"], "start_seconds": ["120", "20"], "properties": ["loud, snoring, noise", "paper, fold, crinkle"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a person is snoring loudly", "the wind blows and a mouse clicks "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a airplane flies overhead as a woman speaks"], "sample_ids": ["tDlysoZiA1I", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["animal, grunt, chirp", "airplane, fly, woman"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["ylpYOorfH4o", "uEU-Hg5MTN8"], "start_seconds": ["410", "27"], "properties": ["engine, running, wind", "a woman, laughs, animal"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 
youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["zY3icUyMdh8", "yDoT73BWsdA"], "start_seconds": ["20", "10"], "properties": ["dog, bark, engine", "engine, revs, vehicle"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "dishes cling together then a man begins to speak"], "sample_ids": ["xfaoyyzw2WU", "sQGXqGcwOTc"], "start_seconds": ["180", "3"], "properties": ["loud, jet engine, roar", "cling, speak, dishes"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "mechanisms are operating and water is splashing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "ticking continues 
without interruption"], "sample_ids": ["tQWGZLItBXk", "v-g-j2uTByM"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "ticking, continuous, clock"], "captions_pred_video": ["worms revolution screenshots", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "wind blowing followed by a zoom"], "sample_ids": ["tQWGZLItBXk", "vr8ZXjEBhMQ"], "start_seconds": ["170", "150"], "properties": ["music, person, ding", "wind, blow, zoom"], "captions_pred_video": ["worms revolution screenshots", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sQGXqGcwOTc", "sLUnaPT5gM8"], "start_seconds": ["3", "0"], "properties": ["audio, kid, giggles", "loud, laughter, intermittent"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a scream", "label": 1}, {"captions": ["someone snores nearby", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["spJCm8tD9Zo", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["someone snores, nearby, someone", "two men, woman, birds"], "captions_pred_video": ["of a man laying on the ground 
with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "water pouring and bubbling"], "sample_ids": ["sWZzXuWYY", "uyRfq-jKPpo"], "start_seconds": ["420", "50"], "properties": ["male, clanks, thumps", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "food is frying while a woman speaks"], "sample_ids": ["zcDwZ6W7E3E", "yhQ2Lg-7qDY"], "start_seconds": ["180", "130"], "properties": ["man, speak, motorcycles", "food, woman, speak"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a faucet is running and a man is speaking"], "question": "which entity is a video of a man speaking?", 
"label": 0}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "roadway noise occurs and a truck accelerates"], "sample_ids": ["su6FAOcOA8c", "tgbONvsP47Y"], "start_seconds": ["4", "0"], "properties": ["engine, run, woman", "noise, truck, accelerate"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "dishes cling together then a man begins to speak"], "sample_ids": ["yNtRmrn0io8", "sQGXqGcwOTc"], "start_seconds": ["210", "3"], "properties": ["storm, distance, strike", "cling, speak, dishes"], "captions_pred_video": ["footage of a house in the middle of the night", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["rain falls and thunder roars", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["people speak in a closed space", "a man speaks followed by another man speaking outside"], "sample_ids": ["sTpirNYo8vQ", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["people, space, speak", "two men, speak, follow"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "someone snores nearby"], "sample_ids": ["vimzuGQvdcU", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["a, man, yells", "someone snores, nearby, someone"], "captions_pred_video": ["a group 
of people are rafting down a river", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a person is snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "water flows and trickles"], "sample_ids": ["tDlfY3nmx1A", "tB7hWb9gTuQ"], "start_seconds": ["160", "30"], "properties": ["applause, laugh, man", "water, flow, trickle"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ugHJF0hfYkg", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["loud, propeller, move", "female, spraying, scream"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "people speak as gunfire rings out"], "sample_ids": ["uWAAAL4CIoc", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["a woman, chirps, animal", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "water pouring 
and bubbling"], "sample_ids": ["zF8yoL0rkbI", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["engine, run, someone", "water, bubbles, pouring"], "captions_pred_video": ["footage of the traffic on the street at night", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a baby laughs giddily and a woman laughs then speaks"], "sample_ids": ["zY3icUyMdh8", "wjsXBsc7M40"], "start_seconds": ["20", "10"], "properties": ["dog, bark, engine", "a baby laughs, a woman laughs, a woman speaks"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of the baby playing with a toothbrush"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a baby laughs and a woman speaks"], "question": "which entity is more playful", "label": 1}, {"captions": ["a clock ticktocks briefly", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["u7C-AEBQM", "wnpJndXuxLc"], "start_seconds": ["30", "50"], "properties": ["ticktocks, clock, ticktocks briefly", "blows, vehicle, train"], 
"captions_pred_video": [null, "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a ticktock of a clock", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is not a clock?", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["w1mlz3Pe4fU", "ukg5L09Wpvo"], "start_seconds": ["300", "150"], "properties": ["vocalize, chirp, continuously", "clickety-clack, train, whistle"], "captions_pred_video": ["of a bird in a cage", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["birds are chirping and singing", "a train blows its whistle and blows its horn "], "question": "which entity is continuous", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a clock ticktocks"], "sample_ids": ["yajyRTUQk3U", "v-g-j2uTByM"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "ticktocks, clock, ticktocks"], "captions_pred_video": ["- a woman cooking in the kitchen", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["un9VQlzgZM", "vVhthZ45k3Y"], "start_seconds": ["5", "30"], "properties": ["wind, speak, laugh", "cat, purr, hiss"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking and a cat is meowing"], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["continuous sizzling with a woman speaking 
towards the end", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["ukxt9I7eMMg", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["continuous, woman, speaking", "water, radio, man"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sEprKHm8Sj8", "su6FAOcOA8c"], "start_seconds": ["90", "4"], "properties": ["car, tires, slows", "engine, idle, woman"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "water flows and trickles"], "sample_ids": ["w8uLijTqtlU", "tB7hWb9gTuQ"], "start_seconds": ["70", "30"], "properties": ["wind, microphone, noise", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and shaky", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["the wind is blowing strongly", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a 
toilet flushes and a female speaks"], "sample_ids": ["sa6TLVbooCc", "yaln9y8I7ms"], "start_seconds": ["240", "230"], "properties": ["people, laugh, child", "female, flushes, toilet"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "someone is typing on a computer keyboard"], "sample_ids": ["v5P-ThUCINM", "v0x1odnXtP0"], "start_seconds": ["400", "210"], "properties": ["background, chirp, bird", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["wSVhSdj0F0", "w34HjHr6gAY"], "start_seconds": ["10", "30"], "properties": ["beep, clang, footsteps", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a beep sounds followed by a child speaking"], "question": "which entity has 
a woman talking?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["tiDFTC-5vU", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a beep sounds followed by a child speaking"], "question": "which entity has a duck quacking?", "label": 0}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sZPuqDgX2V0", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["engine, accelerate, intercom", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["men speak and a nozzle sprays liquid", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wRV8yMk886E", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "a woman, a television program, a bird"], "captions_pred_video": ["two cars are parked in a parking lot at night", "of a man holding a baby duck in 
his hands"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "vehicles pass by on a roadway"], "sample_ids": ["vimzuGQvdcU", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["a, man, yells", "pass, vehicle, roadway"], "captions_pred_video": ["a group of people are rafting down a river", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a car is driving on the road "], "question": "which entity is more passive", "label": 1}, {"captions": ["an engine runs and wind blows", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vs65y4qmyBE", "tdWhHV3X25Q"], "start_seconds": ["340", "60"], "properties": ["engine, run, wind", "applause, audience, yells"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tOSWIURC-4", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["noise, engine, revs", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a lawn mower is running ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying while a woman speaks", "small dogs yip and bark sharply"], "sample_ids": ["yhQ2Lg-7qDY", "v-wcQf4BDY0"], "start_seconds": ["130", "120"], "properties": ["food, woman, 
speak", "bark, yip, sharply"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sAam2NqGhLY", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["snoring, breathing, child", "animal, grunts, snorts"], "captions_pred_video": ["of a little girl sleeping on a couch", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "water pouring and bubbling"], "sample_ids": ["xNMovAf3o50", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["rain, thunder, music", "water, bubbles, pouring"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["thunder and rain with music 
playing in the background ", "water is running from a faucet"], "question": "which entity is more likely to be a natural occurrence", "label": 0}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "someone whistles a tune"], "sample_ids": ["yNtRmrn0io8", "sIXTftIuUgw"], "start_seconds": ["210", "90"], "properties": ["storm, distance, strike", "someone, tune, whistle"], "captions_pred_video": ["footage of a house in the middle of the night", null], "captions_pred_audio": ["rain falls and thunder roars", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["long loud burping by a man", "music plays followed by gunshots and then an explosion"], "sample_ids": ["xmiUIOhtZyQ", "xKB8O8LTs6s"], "start_seconds": ["60", "70"], "properties": ["loud, burp, man", "music, gunshots, explosion"], "captions_pred_video": ["homer simpson drinking a beer", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person burps and music plays in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "a door opens and closes"], "sample_ids": ["uC9dtII1KDI", "vBHyYJ8pL0"], "start_seconds": ["150", "2"], "properties": ["wind, gusts, distance", "open, close, door"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", null], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which door is more likely to open and close", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "wind blows as people chatter quietly"], "sample_ids": ["yZp6xizR0yU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "wind, chatter, 
people"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["ul60S8TXDA8", "ziUT9IFTkjg"], "start_seconds": ["60", "10"], "properties": ["sound, distance, bell", "background, birds, rustling"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", null], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "birds are chirping and a chime is ringing "], "question": "which entity has a bird in the background?", "label": 1}, {"captions": ["birds chirp as a bell rings", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["ziUT9IFTkjg", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["chirp, bell, ring", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "water pouring and bubbling"], "sample_ids": ["vlS6YMeWAPo", "uyRfq-jKPpo"], "start_seconds": ["40", "50"], "properties": ["sheep, baa, birds", "water, bubbles, pouring"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a goat bleats and birds chirp", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["food is frying then a woman speaks", "water rushes and then a vehicle zooms past"], "sample_ids": ["ukxt9I7eMMg", "s4Uz1Ffgo04"], "start_seconds": ["30", "100"], "properties": ["food, woman, speak", "water, rushes, vehicle"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is more active", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vlJS7LN2XyM", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["background, clocks, ticking", "engine, laugh, loud"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a ticktock of a clock", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a man talks as several small 
engines run"], "sample_ids": ["wDVMhEdTiVw", "u9A6VZQCZpU"], "start_seconds": ["30", "30"], "properties": ["gun, shoot, water", "a, man, talk"], "captions_pred_video": ["a blurry image of trees and water in the forest", null], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a man is speaking while a race car is revving and accelerating "], "question": "which entity is talking", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a woman speaks happily and an animal chirps"], "sample_ids": ["w6RTHR6AeAg", "uWAAAL4CIoc"], "start_seconds": ["40", "0"], "properties": ["call, owl, screech", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a woman is speaking and a dog is barking "], "question": "which entity is a bird?", "label": 0}, {"captions": ["multiple birds chirp and an animal grunts", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tDlysoZiA1I", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["animal, grunt, multiple", "a, scream, girl"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["dogs barking and whimpering", "vehicles pass by on a roadway"], "sample_ids": ["tIY7qOV3rEM", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["barking, whimpering, dog", "pass, vehicle, roadway"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a car is driving on the road "], "question": "which entity is more passive", "label": 1}, {"captions": ["a person burps 
loudly for a long time nearby", "a duck quacks loudly and continuously"], "sample_ids": ["vf44CgrjT0A", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "loud, continuous, quacks"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a loud burp", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["ukg5L09Wpvo", "ukg5L09Wpvo"], "start_seconds": ["150", "150"], "properties": ["clickety-clack, train, whistle", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a train blows its whistle and blows its horn "], "question": "which train is going faster", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a man speaks as a motor runs in the background"], "sample_ids": ["xC8kbrKJmco", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["background, goat, scream", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a goat is bleating ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a dark barks and whimpers", "water flows and trickles"], "sample_ids": ["sYj4hpDUZDQ", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "water, flow, trickle"], "captions_pred_video": ["a brown and 
white dog standing in front of a wall with its mouth open", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a dog barks and a cat meows", "water is splashing and gurgling"], "question": "which entity is more silent", "label": 1}, {"captions": ["birds coo incessantly", "a man speaks as a car is passing by"], "sample_ids": ["yZrFNS7GFBQ", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["coo, bird, incessant", "a, car, pass"], "captions_pred_video": ["of the bird in the cage", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "someone whistles a tune"], "sample_ids": ["tIY7qOV3rEM", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "someone, tune, whistle"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a person whistling a song"], "question": "which entity is a human activity", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a man speaks as a car is passing by"], "sample_ids": ["spYNpeN7rPY", "sK4u5T8hW78"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "a, car, pass"], "captions_pred_video": ["in 10 words or less what is the name of the song in the 
maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "people cheer as a vehicle engine revs"], "sample_ids": ["yajyRTUQk3U", "xjhAnI2q6hM"], "start_seconds": ["400", "6"], "properties": ["a woman, something, fried", "engine revs, vehicle, people"], "captions_pred_video": ["- a woman cooking in the kitchen", "a school bus decorated with christmas lights is floating in the 
water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a person snoring several times", "an infant crying as a woman laughs"], "sample_ids": ["spJCm8tD9Zo", "xhmRY9yhC7c"], "start_seconds": ["90", "20"], "properties": ["snore, person, several", "a, laugh, infant"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a person is snoring loudly", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zALy31PjDl0", "ukg5L09Wpvo"], "start_seconds": ["21", "150"], "properties": ["a man, a vehicle, a horn", "clickety-clack, train, whistle"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a woman speaks as she rubs two objects together"], "sample_ids": ["weDbePuc-Xc", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["cartoon character, music, vocalize", "two objects, woman, speak"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a baby coos and fidgets as a lady speaks and laughs"], "sample_ids": ["yJ0TePmaOo", "uPDn2BFTHk"], "start_seconds": ["390", "140"], "properties": ["two hard objects, man, speak", "lady, laugh, baby"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a baby laughs and a woman speaks"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a small engine idles continuously", "a clock ticktocks"], "sample_ids": ["y5WII6cTH7k", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["engine, idle, continuously", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a clock is ticking loudly"], "question": "which entity is ticking continuously", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "three men talk while wind blows and 
some liquid flows"], "sample_ids": ["w5W5Kqtc8E", "vJ7JPEFhyLA"], "start_seconds": ["100", "16"], "properties": ["wind, engine, scream", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "birds chirp as a man speaks and a younger person speaks"], "sample_ids": ["s4Uz1Ffgo04", "xl2PIWyXaM"], "start_seconds": ["100", "160"], "properties": ["water, rushes, motorcycle", "chirp, man, younger person"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "birds are chirping and people are talking"], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "paper is crumpling consistently"], "sample_ids": ["tK4VlLsNxak", "v5cSxLaHADY"], "start_seconds": ["120", "0"], "properties": ["a, dial, telephone", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["goats bleat and metal clings", "people applaud and hoot and chat quietly"], "sample_ids": ["tH17JPjDPnc", "wwyfGO2J4"], "start_seconds": ["260", "90"], "properties": ["bleat, metal, clings", "people, applaud, hoot"], "captions_pred_video": ["feed of the goats eating hay in the barn", null], "captions_pred_audio": ["a cow is mooing and mechanisms are 
ticking ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "someone whistles a tune"], "sample_ids": ["yZp6xizR0yU", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["animal, bleat, cry", "someone, tune, whistle"], "captions_pred_video": ["footage of a woman feeding goats in a barn", null], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a beep repeats multiple times", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["y682ml90jGw", "vbZ-0lGPneg"], "start_seconds": ["11", "30"], "properties": ["beep, repeat, multiple", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["zO-LSSY92ZM", "vzceMbklWc"], "start_seconds": ["30", "180"], "properties": ["liquid, surface, sound", "water, faucet, sink"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube 
how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", null], "captions_pred_audio": ["steam is hissing and hissing", "water is running and a man is speaking"], "question": "which entity is a source of water", "label": 1}, {"captions": ["an engine runs and a man speaks", "an airplane engine runs"], "sample_ids": ["yT5WfYMRr-U", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["engine, run, man", "engine, airplane, runs"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a car is driving by on the road "], "question": "which entity has a man speaking while an engine runs?", "label": 0}, {"captions": ["a horn rings out as a machine runs by", "a machine beeps continuously"], "sample_ids": ["slZLHwNbbt4", "y682ml90jGw"], "start_seconds": ["300", "11"], "properties": ["a, horn, run", "beeps, machine, continuously"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a beeping sound is being made "], "question": "which machine is beeping continuously?", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a clock ticktocks"], "sample_ids": ["xjhAnI2q6hM", "v-g-j2uTByM"], "start_seconds": ["6", "30"], "properties": ["wind, blow, loudly", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "in your own words a cuckoo 
clock hanging on the wall"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["dogs barking and whimpering", "wind blows as people chatter quietly"], "sample_ids": ["tIY7qOV3rEM", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "wind, chatter, people"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vGj1XLJvNrw", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["wails, wails, pass", "a woman, something, fried"], "captions_pred_video": ["footage of a police car driving down a city street", "- a woman cooking in the kitchen"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a person uses a saw to cut some wood"], "sample_ids": ["wTjoRj1se3U", "sHbXC6na9hg"], "start_seconds": ["390", "0"], "properties": ["engine, run, people", "a person, saw, wood"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a jet engine is running and people are talking", "an engine is idling and vibrating"], "question": "which entity is a person", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "several insects fly while two men talk"], "sample_ids": ["vJvryTwuAV8", 
"s-T9OVOiMLo"], "start_seconds": ["16", "330"], "properties": ["audience, cheer, man", "several, fly, men"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zofjfKhqLk8", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "stream, water, flow"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage is blurry and out of focus"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xyL9F5VrjkE", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["wind, motor, distance", "female, spraying, scream"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tezvROoo4bs", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["audio, 
throttle, speaking", "men, talk, cars"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["food is frying and sizzles", "people cheer as a vehicle engine revs"], "sample_ids": ["zNRChLjqcU", "xjhAnI2q6hM"], "start_seconds": ["220", "6"], "properties": ["food is frying, sizzles, food", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["water is running from a faucet into a sink", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man is filing a hard object", "a stream runs then someone speaks"], "sample_ids": ["vveS8HT7Uog", "wbHTKEJZyhc"], "start_seconds": ["100", "20"], "properties": ["a man, hard, object", "stream, run, someone"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video 
footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a waterfall is flowing and people are speaking "], "question": "which entity is a stream", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a clock ticktocks"], "sample_ids": ["vZAw4apG0Es", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking and people are talking", "a clock is ticking loudly"], "question": "which entity has a tick that repeats", "label": 0}, {"captions": ["the revving of an engine throttle followed by a man speaking", "a clock ticktocks"], "sample_ids": ["tezvROoo4bs", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["audio, throttle, speaking", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", 
"in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["x9JovgqUcs", "yajyRTUQk3U"], "start_seconds": ["500", "400"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a person speaking and typing on a computer keyboard?", "label": 0}, {"captions": ["a telephone rings followed by a woman talking", "someone whistles briefly"], "sample_ids": ["tGcFnX0GHI", "uFoga8sHpiw"], "start_seconds": ["0", "90"], "properties": ["ring, talk, woman", "sound, duration, pitch"], "captions_pred_video": [null, "footage of a bird in a cage"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a person whistles a song"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a man speaks followed by another man speaking outside"], "sample_ids": ["ukxt9I7eMMg", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "two men, speak, follow"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a goat screams and people 
speak in the background", "water pouring and bubbling"], "sample_ids": ["xC8kbrKJmco", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["background, goat, scream", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a goat is bleating ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a grown man speaks and water bubbles and runs"], "sample_ids": ["v7jJS8aAyA", "vSeGhaZt-aI"], "start_seconds": ["10", "50"], "properties": ["wind, blows, loudly", "water, bubbles, run"], "captions_pred_video": [null, "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a baby cries and a woman moans", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["smDKStoHBJo", "w34HjHr6gAY"], "start_seconds": ["0", "30"], "properties": ["a, cry, woman", "beeps, hit, woman"], "captions_pred_video": ["a man holding a crying baby in his arms", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 
1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "people speak as gunfire rings out"], "sample_ids": ["zofjfKhqLk8", "wqTCwqVRDlk"], "start_seconds": ["10", "80"], "properties": ["background, metal, clank", "gunfire, ring, speak"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a door opens and birds chirp", "a woman speaks happily and an animal chirps"], "sample_ids": ["yeFvk9x0wWI", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["door, open, birds", "a woman, chirps, animal"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "children cheer as a man speaks then an audience screams"], "sample_ids": ["zgUgkpk78xU", "vJvryTwuAV8"], "start_seconds": ["70", "16"], "properties": ["horn, bells, ring", "audience, cheer, man"], 
"captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is a warning", "label": 0}, {"captions": ["a man speaking with light rustling", "vehicles pass by on a roadway"], "sample_ids": ["zOZleIRqZm4", "tgbONvsP47Y"], "start_seconds": ["80", "0"], "properties": ["light, rustling, man", "pass, vehicle, roadway"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "water runs into a sink while men speak"], "sample_ids": ["sAam2NqGhLY", "vzceMbklWc"], "start_seconds": ["20", "180"], "properties": ["snoring, breathing, child", "water, sink, run"], "captions_pred_video": ["of a little girl sleeping on a couch", null], "captions_pred_audio": ["a person is snoring", "water is running and a man is speaking"], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "roadway noise occurs and a truck accelerates"], "sample_ids": ["uiS58TNyUiw", "tgbONvsP47Y"], "start_seconds": ["430", "0"], "properties": ["audio, man, speaking", "noise, truck, accelerate"], "captions_pred_video": ["of the pigeon in the cage", "footage of a fire truck entering a 
garage"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a car is driving on the road "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man woman speak while crickets sing", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["zTLVJCo4WEE", "vlJS7LN2XyM"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "background, clocks, ticking"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a ticktock of a clock"], "question": "which entity has a clock ticking in the background?", "label": 1}, {"captions": ["a weapon fires multiple times", "wind blows as people chatter quietly"], "sample_ids": ["sMC07Ucy7kg", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["weapon, fire, multiple", "wind, chatter, people"], "captions_pred_video": ["footage is from a car's point of view", "footage is blurry and out of focus"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "people cheer as a vehicle engine revs"], "sample_ids": ["vmrxwuAMb2I", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["a dog, inhales, exhales", "engine revs, vehicle, people"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a dog barks and growls", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["someone snores nearby", "two men and a woman 
talk while wind blows and birds tweet"], "sample_ids": ["spJCm8tD9Zo", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["someone snores, nearby, someone", "two men, woman, birds"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "animals bleat and moo as a person speaks"], "sample_ids": ["xKB8O8LTs6s", "tPJvjq9QePY"], "start_seconds": ["70", "40"], "properties": ["music, gunfire, explosion", "animal, bleat, moo"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a dog and a sheep in a barn"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a baby cries and a man speaks"], "question": "which entity is more calm", "label": 1}, {"captions": ["some clanking with distant murmuring", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["uMTTDZ2mb4", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["clanking, murmuring, distant", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more animal like", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "a church bell rings several times"], "sample_ids": ["uPDn2BFTHk", "sUVVjE3Ucp8"], "start_seconds": ["140", "0"], "properties": ["woman, laughs, speaks", "ring, bell, several"], "captions_pred_video": [null, "the video shows a stone wall with a clock on top of it and a bench in front of it"], "captions_pred_audio": ["a baby laughs and a 
woman speaks", "a church bell is ringing "], "question": "which entity is silent", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "paper is crumpling consistently"], "sample_ids": ["spYNpeN7rPY", "v5cSxLaHADY"], "start_seconds": ["1", "0"], "properties": ["a clock, ticktock, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a piece of wood is being placed down and sawed"], "sample_ids": ["xBxDz0CFVn0", "uiItxDsDMFI"], "start_seconds": ["30", "30"], "properties": ["stream, water, flow", "wood, piece, saw"], "captions_pred_video": ["footage is blurry and out of focus", "a man cutting a log with an axe in the 
woods"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["soTOh3zYJfY", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["vehicle, skid, tires", "a woman, something, fried"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["w2JXXIAdUdg", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["snoring, distance, person", "animal, grunts, snorts"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking indiscriminately in the distance with a person snoring loudly nearby?", "label": 0}, {"captions": ["a man rubs two objects together then speaks", "a telephone rings followed by a woman talking"], "sample_ids": ["vveS8HT7Uog", "tGcFnX0GHI"], "start_seconds": ["100", "0"], "properties": ["a man, objects, speak", "ring, talk, woman"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a dial tone sounds followed by a woman speaking"], 
"question": "which entity is a conversation", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["wRBHTgrbiwg", "tiDFTC-5vU"], "start_seconds": ["50", "30"], "properties": ["birds, chirp, cooing", "male, duck, laugh"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and ducks are quacking"], "question": "which entity is a group of birds?", "label": 0}, {"captions": ["a loud engine muffles a man as he speaks", "people speak as gunfire rings out"], "sample_ids": ["xyx6eNVEYRY", "wqTCwqVRDlk"], "start_seconds": ["380", "80"], "properties": ["loud, engine, muffles", "gunfire, ring, speak"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["continuous snoring", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sLkeqCDJIyw", "ukg5L09Wpvo"], "start_seconds": ["120", "150"], "properties": ["loud, snoring, noise", "clickety-clack, train, whistle"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a person is snoring loudly", "a train blows its whistle and blows its horn "], "question": "which noise is continuous", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a train horn blows as it passes by"], "sample_ids": ["yZmhM1HcsyE", "zVacuqSb4LI"], "start_seconds": ["4", "30"], "properties": ["engine, roar, water", "horn, blows, train"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which train is passing by", "label": 1}, {"captions": ["birds chirp and wind blows", "wind blowing followed by a zoom"], "sample_ids": ["sxIvBMSavMQ", "vr8ZXjEBhMQ"], "start_seconds": ["210", "150"], "properties": ["birds, chirp, wind", "wind, blow, zoom"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from 
a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a video of a wind blowing?", "label": 1}, {"captions": ["a person screams glaringly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xC8kbrKJmco", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["glaringly, screams, person", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a goat is bleating ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sQGXqGcwOTc", "yajyRTUQk3U"], "start_seconds": ["3", "400"], "properties": ["cling, speak, dishes", "a woman, something, fried"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "- a woman cooking in the kitchen"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "three men talk while wind blows and some 
liquid flows"], "sample_ids": ["wqADXCzngMw", "vJ7JPEFhyLA"], "start_seconds": ["340", "16"], "properties": ["engine, idle, man", "three men, wind, flow"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["a woman and man are speaking", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vbpKkWvfOu4", "vfYTJq7nU"], "start_seconds": ["560", "130"], "properties": ["two people, speaking, woman, man", "rustling, ducks, quack"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a duck quacks and a woman speaks"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["w6RTHR6AeAg", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["call, owl, screech", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a goat bleats and birds chirp"], "question": "which entity is a bird?", "label": 0}, {"captions": ["a helicopter engine runs", "a horn rings out as a machine runs by"], "sample_ids": ["t5ZbXbniOWk", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["engine, helicopter, run", 
"a, horn, run"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a helicopter is flying overhead ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["vddP56-ogds", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["water, flow, laugh", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["people speak softly as food sizzles", "birds chirp and objects are moved around"], "sample_ids": ["yhQ2Lg-7qDY", "yPUYU6t3rwo"], "start_seconds": ["130", "370"], "properties": ["food, sizzle, speak", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a faucet is running and a man is speaking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "a man speaks as a boat engine runs"], "sample_ids": ["yDoT73BWsdA", "wtDqrBygTcU"], "start_seconds": ["10", "30"], "properties": ["engine, revs, vehicle", "man, engine, run"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "shows a person riding on the back of a boat as it speeds through the water"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a motor is running"], "question": "which entity is about a 
vehicle engine?", "label": 0}, {"captions": ["a woman speaks and dog vocalizes", "a man speaks as a motor runs in the background"], "sample_ids": ["uWAAAL4CIoc", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["a, dog, vocalize", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["yZp6xizR0yU", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["animal, bleat, cry", "loud, intense, propeller"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a man speaks as a car is passing by"], "sample_ids": ["uEU-Hg5MTN8", "sK4u5T8hW78"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "a, car, pass"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with background noise and breathing sounds "], "question": 
"which entity is about a car passing by?", "label": 1}, {"captions": ["a woman talking as an infant is crying", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tMbMDvT50j8", "vbZ-0lGPneg"], "start_seconds": ["12", "30"], "properties": ["a, talk, infant", "a woman, a television program, a bird"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman is speaking and a dog is whimpering"], "question": "which woman is talking", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "ticking continues without interruption"], "sample_ids": ["w2M4i1mklOA", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "ticking, continuous, clock"], "captions_pred_video": ["footage of an antique clock", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a clock is ticking loudly"], "question": "which clock is ticking continuously", "label": 1}, {"captions": ["a beep occurs briefly", "an engine runs loudly"], "sample_ids": ["xtWeJ56-U-g", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["beep, occur, briefly", "loud, engine, run"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage is blurry because it's raining outside"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "roadway noise occurs and a 
truck accelerates"], "sample_ids": ["y4tPJXBKDig", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["a, noise, talk", "noise, truck, accelerate"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a car is driving on the road "], "question": "which noise is made by a truck", "label": 1}, {"captions": ["a helicopter engine runs", "a toilet flushes and a female speaks"], "sample_ids": ["t5ZbXbniOWk", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["engine, helicopter, run", "female, flushes, toilet"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage is blurry and out of focus"], "captions_pred_audio": ["a helicopter is flying overhead ", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "pigeons vocalize and birds chirp"], "sample_ids": ["vlS6YMeWAPo", "uiS58TNyUiw"], "start_seconds": ["40", "430"], "properties": ["sheep, baa, birds", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "of the pigeon in the cage"], "captions_pred_audio": ["a goat bleats and birds chirp", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["food is frying then a woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["ukxt9I7eMMg", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["food, woman, speak", "people, applaud, hoot"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "people are clapping and speaking with 
background noise "], "question": "which entity is a performance", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a clock ticktocks"], "sample_ids": ["tDlysoZiA1I", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, chirp", "ticktocks, clock, ticktocks"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a woman speaks happily and an animal chirps"], "sample_ids": ["sU53zg9Jp7s", "uWAAAL4CIoc"], "start_seconds": ["380", "0"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a woman, chirps, animal"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be a crow?", "label": 0}, {"captions": ["material crumbles into a microphone", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vofpvUo6NAw", "zj2R0XoFr5k"], "start_seconds": ["220", "50"], "properties": ["material, crumbles, microphone", "airplane, boy, fly"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "vehicles pass by on a roadway"], "sample_ids": ["xyL9F5VrjkE", "tgbONvsP47Y"], 
"start_seconds": ["20", "0"], "properties": ["wind, blows, vehicle", "pass, vehicle, roadway"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a fire truck entering a garage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sEprKHm8Sj8", "xKB8O8LTs6s"], "start_seconds": ["90", "70"], "properties": ["car, tires, slows", "music, gunfire, explosion"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sOa7g-44Dag", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["audio, scratching, man", "three men, wind, flow"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["several ducks quack and cocks crow far away", "a man speaks as a car is passing by"], "sample_ids": ["sNB8zxXneIM", "sK4u5T8hW78"], 
"start_seconds": ["20", "30"], "properties": ["several, quack, cocks", "a, car, pass"], "captions_pred_video": ["a group of geese in a cage", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a clock ticktocks continuously", "a car speeding up in the distance"], "sample_ids": ["vlJS7LN2XyM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, ticktocks continuously", "distance, car, speed"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["t97k0cejSQE", "tiDFTC-5vU"], "start_seconds": ["250", "30"], "properties": ["bird, chirp, insect", "male, duck, laugh"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking and ducks are quacking"], "question": "which entity is a group of animals?", "label": 0}, {"captions": ["water bubbles and gurgles.", "an airplane flies overhead as a woman speaks"], "sample_ids": ["tB7hWb9gTuQ", "zj2R0XoFr5k"], "start_seconds": ["30", 
"50"], "properties": ["bubbles, gurgles, water", "airplane, fly, overhead"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["water is splashing and gurgling", "a woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp then an animal grunts", "someone is typing on a computer keyboard"], "sample_ids": ["tDlysoZiA1I", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["animal, grunt, chirp", "keyboard, type, computer"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "how to make money on youtube in spanish"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a person is typing on a keyboard"], "question": "which is not a type of animal", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sWZzXuWYY", "su6FAOcOA8c"], "start_seconds": ["420", "4"], "properties": ["male, clanks, thumps", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle accelerates and squeals tires"], "sample_ids": ["sfAvvZwdLCY", "yRx9txMcBl0"], "start_seconds": ["20", "40"], "properties": ["water drains, flushes, water", "accelerates, tires, squeals"], "captions_pred_video": ["footage of the toilet in the bathroom", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 
mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a toilet is flushed", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "people applaud and hoot and chat quietly"], "sample_ids": ["wy1eKjR7KC0", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["people, talk, distance", "people, applaud, hoot"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a male speaks over some small clicks", "water is sprayed across a hard surface"], "sample_ids": ["uXxVebHsGZ8", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["male, clicks, speak", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a train engine runs and a horn blows", "people applaud and hoot and chat quietly"], "sample_ids": ["zPX9o1uDiI", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["engine, horn, run", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "people are clapping and speaking with background noise "], 
"question": "which entity is more quiet", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a car revs and accelerates loudly and men and women chatter among themselves"], "sample_ids": ["uiS58TNyUiw", "y8dSeubCNI"], "start_seconds": ["430", "4"], "properties": ["vocalize, bird, chirp", "men, women, car"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "an engine revving and people talking in the background"], "question": "which entity is a human activity", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a clock ticktocks"], "sample_ids": ["weDbePuc-Xc", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["music, slaps, human", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a train horn blows as it passes by"], "sample_ids": ["sfAvvZwdLCY", "zVacuqSb4LI"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "horn, blows, train"], "captions_pred_video": ["footage of the toilet in the bathroom", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a toilet is flushed", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "water splashes as an animal walks through"], "sample_ids": ["vzceMbklWc", "w1ir-sZ3Im8"], "start_seconds": ["180", "90"], "properties": ["water, faucet, sink", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["water is running and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sxYkFKFIZD0", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["screech, man, door", "People, motor, brakes"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "an infant crying as a woman laughs"], "sample_ids": 
["xOZfdgAgJ9o", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["woman, whimpering, speaking", "a, laugh, infant"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby cries and a woman speaks"], "question": "which woman is speaking?", "label": 0}, {"captions": ["multiple birds vocalize and wind blows", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uoGVs9yUqY4", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["multiple, vocalize, wind", "loud, multiple, distance"], "captions_pred_video": ["for how to make a wooden shed door youtube", null], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a drill drills through something then people begin laughing"], "sample_ids": ["tQWGZLItBXk", "tEE3MpBt1sg"], "start_seconds": ["170", "50"], "properties": ["music, kid, speak", "drill, something, laugh"], "captions_pred_video": ["worms revolution screenshots", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vBHyYJ8pL0", "ukg5L09Wpvo"], "start_seconds": ["2", "150"], "properties": ["noise, door, opening", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["mechanisms are 
ticking and a sliding door is opening and closing ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a clock ticktocks in wind", "an airplane engine spools and people speak"], "sample_ids": ["yVumC9TGknc", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["ticktocks, clock, wind", "airplane, engine, spool"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a series of beeps and chirps", "a jet engine is running and people are talking"], "question": "which entity is moving", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "water is sprayed across a hard surface"], "sample_ids": ["wDVMhEdTiVw", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["gun, shoot, water", "water, spray, surface"], "captions_pred_video": ["a blurry image of trees and water in the forest", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "spraying followed by silence"], "question": "which entity is a spray of water?", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["u7C-AEBQM", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["ticks, rhythmic, quiet", "sound, chirp, buzz"], "captions_pred_video": [null, "a bee on a purple thistle flower"], "captions_pred_audio": ["a ticktock of a clock", "a bee buzzes and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a telephone rings and a bird vocalizes", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["skd2PphS6oI", "uYT5gxnyMWM"], "start_seconds": ["190", "50"], "properties": ["ring, bird, vocalize", "female, 
spraying, scream"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a small engine idles continuously", "a car speeding up in the distance"], "sample_ids": ["y5WII6cTH7k", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["engine, idle, continuously", "distance, car, speed"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", null], "captions_pred_audio": ["an engine is knocking and vibrating ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["an adult male speaks and dials a rotary phone", "a car speeding up in the distance"], "sample_ids": ["tK4VlLsNxak", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "distance, car, speed"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "wind blows as people chatter quietly"], "sample_ids": ["vms5XGTDVQc", "xBxDz0CFVn0"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "wind, chatter, people"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "footage is blurry and out of focus"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["children cheer as a man speaks then an audience 
screams", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["vJvryTwuAV8", "vqZuVbG6-HI"], "start_seconds": ["16", "130"], "properties": ["audience, cheer, man", "background, male, female"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a lawn mower is running and men are speaking "], "question": "which entity has a man speaking to an audience?", "label": 0}, {"captions": ["someone is snoring while sleeping", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["ujMt0-D-x2k", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["snore, sleep, someone", "engine, laugh, loud"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person is snoring loudly", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sWZzXuWYY", "vYkA3cfXp5Q"], "start_seconds": ["420", "30"], "properties": ["male, clanks, thumps", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "winds blows roughly as a vehicle races past"], "sample_ids": ["yRx9txMcBl0", "xjvTpk2Zpr8"], "start_seconds": ["40", "70"], "properties": ["accelerates, tires, squeals", "wind, blows, vehicle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft 
auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a jet engine roars and wind blows "], "question": "which vehicle is moving faster", "label": 1}, {"captions": ["a person snoring", "someone snores nearby"], "sample_ids": ["t8tv5YRMJUg", "spJCm8tD9Zo"], "start_seconds": ["0", "90"], "properties": ["a person, snore, loud", "someone snores, nearby, someone"], "captions_pred_video": ["of a man getting his face licked by another man", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a person is snoring loudly"], "question": "which entity is a person?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "a vehicle is skidding and squealing tires"], "sample_ids": ["yRx9txMcBl0", "soTOh3zYJfY"], "start_seconds": ["40", "40"], "properties": ["motors, tires, screech", "vehicle, skid, tires"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wztCSUxOf8", "yajyRTUQk3U"], "start_seconds": ["130", "400"], "properties": ["a crowd, yells, applauds", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking while food is frying in the background"], "question": "which entity is a demonstration", "label": 1}, {"captions": ["scraping and female speech with distant music", "people speak as gunfire rings out"], "sample_ids": ["yHeVV-xeOxQ", "wqTCwqVRDlk"], "start_seconds": ["130", "80"], "properties": ["female, speech, music", "gunfire, ring, speak"], "captions_pred_video": ["of a girl milking a goat's udder", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vbZ-0lGPneg", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["a woman, a television program, a bird", "engine, revs, vehicle"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", 
"water pouring and bubbling"], "sample_ids": ["yajyRTUQk3U", "uyRfq-jKPpo"], "start_seconds": ["400", "50"], "properties": ["noise, woman, speak", "water, bubbles, pouring"], "captions_pred_video": ["- a woman cooking in the kitchen", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zO-LSSY92ZM", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["liquid, surface, sound", "music, gunfire, explosion"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how 
to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["steam is hissing and hissing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["people speak softly as food sizzles", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yhQ2Lg-7qDY", "uYT5gxnyMWM"], "start_seconds": ["130", "50"], "properties": ["food, sizzle, speak", "female, spraying, scream"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a clock ticktocks"], "sample_ids": ["zY3icUyMdh8", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wRV8yMk886E", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["liquid, spray, nozzle", "a woman, something, fried"], 
"captions_pred_video": ["two cars are parked in a parking lot at night", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking", "label": 1}, {"captions": ["birds tweet and squawk", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["w1mlz3Pe4fU", "x9JovgqUcs"], "start_seconds": ["300", "500"], "properties": ["squawk, tweet, scream", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["of a bird in a cage", null], "captions_pred_audio": ["birds are chirping and singing", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tDVADusiIoc", "uYT5gxnyMWM"], "start_seconds": ["60", "50"], "properties": ["wind, radio, waves", "a, scream, girl"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking over a radio?", "label": 0}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "water is sprayed across a hard surface"], "sample_ids": ["y8dSeubCNI", "sQwlkXjQabo"], "start_seconds": ["4", "10"], "properties": ["men, women, car", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["an engine revving and people talking in the background", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a child yells and 
another yells", "a machine beeps continuously"], "sample_ids": ["vMDHu7Lxcgw", "y682ml90jGw"], "start_seconds": ["410", "11"], "properties": ["two, yell, child", "beeps, machine, continuously"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", null], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a machine beeps continuously", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["y682ml90jGw", "ukg5L09Wpvo"], "start_seconds": ["11", "150"], "properties": ["beeps, machine, continuously", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a beeping sound is being made ", "a train blows its whistle and blows its horn "], "question": "which entity is continuous", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["w6RTHR6AeAg", "x9JovgqUcs"], "start_seconds": ["40", "500"], "properties": ["call, owl, screech", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": [null, null], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": ["a door opens and closes", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vBHyYJ8pL0", "sLUnaPT5gM8"], "start_seconds": ["2", "0"], "properties": ["open, close, door", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a baby is laughing and breathing while a man is speaking "], 
"question": "which entity is more quiet", "label": 0}, {"captions": ["multiple insects buzz over rustling wind", "water rushes and then a vehicle zooms past"], "sample_ids": ["tMJne1a4AFI", "s4Uz1Ffgo04"], "start_seconds": ["0", "100"], "properties": ["wind, buzz, rustling", "water, rushes, vehicle"], "captions_pred_video": ["a swarm of bees on the ground", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is moving faster", "label": 1}, {"captions": ["water flows and trickles", "a woman speaks and then a man speaks"], "sample_ids": ["tB7hWb9gTuQ", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["water, flow, trickle", "a, man, speaks"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["water is splashing and gurgling", "a woman is speaking and a man is speaking"], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["food is frying and sizzles", "a man speaks while turning a water faucet on"], "sample_ids": ["zNRChLjqcU", "vf9xf3vMsGM"], "start_seconds": ["220", "540"], "properties": ["food is frying, sizzles, food", "A man speaks while turning a water faucet on."], "captions_pred_video": [null, "of the person washing their hands under the faucet"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking while water is running in the background"], "question": "which entity is a man speaking while turning a water faucet on?", "label": 1}, {"captions": ["females talk and laugh over gusting wind", 
"popping and crackling repeats as men yell and laugh"], "sample_ids": ["un9VQlzgZM", "rqu8iB22IY"], "start_seconds": ["5", "5"], "properties": ["females, talk, laugh", "sound, repeats, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a dog barks and a man speaks while music plays "], "question": "which entity is more like a joke", "label": 1}, {"captions": ["people clap and speak in the distance", "a child speaks in closed space"], "sample_ids": ["wwyfGO2J4", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["clap, distance, speak", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "water splashes as an animal walks through"], "sample_ids": ["vddP56-ogds", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["water, splash, person, laugh", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is about splashing water?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "a woman speaks happily and an animal chirps"], "sample_ids": ["x6ijhqRY38s", "uWAAAL4CIoc"], "start_seconds": ["250", "0"], "properties": ["something metal, glass, hit", "a woman, chirps, animal"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man 
is speaking and dishes are clanging ", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zOZleIRqZm4", "tiDFTC-5vU"], "start_seconds": ["80", "30"], "properties": ["rustling, leaves, person", "male, duck, laugh"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking over rustling leaves?", "label": 0}, {"captions": ["birds chirp and an insect buzzes around", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["t97k0cejSQE", "zj2R0XoFr5k"], "start_seconds": ["250", "50"], "properties": ["bird, chirp, insect", "airplane, boy, fly"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wEBlkGWVWwE", "uEU-Hg5MTN8"], "start_seconds": ["260", "27"], "properties": ["a, babble, woman", "a woman, laughs, animal"], "captions_pred_video": ["shows a person writing on the whiteboard", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a horn rings 
out as a machine runs by"], "sample_ids": ["uRExseg-0XI", "slZLHwNbbt4"], "start_seconds": ["210", "300"], "properties": ["woman, man, water", "a, horn, run"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity has a horn?", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "an insect buzzes around continuously"], "sample_ids": ["xjhAnI2q6hM", "v25l1jef3JY"], "start_seconds": ["6", "0"], "properties": ["wind, blow, loudly", "buzzes, continuously, insect"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a fly is buzzing around a microphone "], "question": "which entity is a natural phenomenon", "label": 0}, {"captions": ["a woman speaks and dog vocalizes", "a duck quacks loudly and continuously"], "sample_ids": ["uWAAAL4CIoc", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["a, dog, vocalize", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["an airplane accelerates briefly", "a duck quacks loudly and continuously"], "sample_ids": ["zjTG0gaGCUI", "vh30P49Po6s"], "start_seconds": ["80", "30"], "properties": ["accelerates, airplane, briefly", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a 
toothbrush in his mouth"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "birds chirp and objects are moved around"], "sample_ids": ["y8WEcpOlT3I", "yPUYU6t3rwo"], "start_seconds": ["40", "370"], "properties": ["wind, speak, buffeting", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a person sniffs and sneezes", "a infant makes noise and is excited"], "sample_ids": ["uRlbY6aoBU", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["sneezes, person, sniffs", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is sneezing ", "a baby cries and a woman speaks"], "question": "which entity is more likely to be a baby", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wjsXBsc7M40", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "three men, wind, flow"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 0}, {"captions": ["an engine idles quietly then gradually becomes louder", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["vbr9mHKc8WM", "siJFXfGWgDk"], 
"start_seconds": ["40", "50"], "properties": ["noise, loudness, engine", "a, bird, vehicle"], "captions_pred_video": [null, "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["an engine is idling", "a man is speaking and birds are chirping in the background "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["su6FAOcOA8c", "y8WEcpOlT3I"], "start_seconds": ["4", "40"], "properties": ["engine, idle, woman", "harsh, wind, blows"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking to another man?", "label": 1}, {"captions": ["people speak then an engine runs", "some men converse over an engine running"], "sample_ids": ["uMTTDZ2mb4", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["engine, run, people", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows people speaking while an engine runs?", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["tK4VlLsNxak", "y8WEcpOlT3I"], "start_seconds": ["120", "40"], "properties": ["a, dial, telephone", "harsh, wind, blows"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking with wind noise in the 
background "], "question": "which entity is about a man speaking and dialing a rotary telephone?", "label": 0}, {"captions": ["a person is burping while a girl speaks", "a horn blasts as warning bells ring"], "sample_ids": ["vdoxuJn9lTc", "zgUgkpk78xU"], "start_seconds": ["40", "70"], "properties": ["person, burp, girl", "horn, bells, ring"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a child speaks followed by a burp", "a train blows its horn as it speeds down the tracks "], "question": "which entity is louder", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a car accelerates and wind blows"], "sample_ids": ["ukg5L09Wpvo", "u0TrcHhkPQ"], "start_seconds": ["150", "20"], "properties": ["clickety-clack, train, whistle", "accelerates, wind, blows"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zcDwZ6W7E3E", "uYT5gxnyMWM"], "start_seconds": ["180", "50"], "properties": ["man, speak, motorcycles", "a, scream, girl"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while a car 
accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["small dogs yip and bark sharply", "a duck quacks continuously"], "sample_ids": ["v-wcQf4BDY0", "vh30P49Po6s"], "start_seconds": ["120", "30"], "properties": ["bark, yip, sharply", "quacks, continuously, duck"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a dog barks and growls", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["water runs into a sink while men speak", "continuous chugging with birds chirping in the background"], "sample_ids": ["vzceMbklWc", "xM4joTqDVp4"], "start_seconds": ["180", "160"], "properties": ["water, sink, run", "background, chirp, birds"], "captions_pred_video": [null, "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["water is running and a man is speaking", "birds are chirping and a train is moving "], "question": "which entity has birds chirping in the background?", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a man speaks as a car is passing by"], "sample_ids": ["wIvYjuR3nrg", "sK4u5T8hW78"], "start_seconds": ["9", "30"], "properties": ["birds, pigeons, vocalize", "a, car, pass"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai"], "captions_pred_audio": ["birds are chirping and cooing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a moving object", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yI-KvObbDoY", "ukg5L09Wpvo"], "start_seconds": ["260", "150"], "properties": ["sound, smack, wind", "clickety-clack, train, whistle"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vbr9mHKc8WM", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["noise, loudness, engine", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["an engine is idling", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["an electronic device bleeps once", "an infant crying frantically"], 
"sample_ids": ["tHJ6JSa8Y4", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["bleeps, electronic, device", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a clock is ticking and beeping", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "water flows and trickles"], "sample_ids": ["tdWhHV3X25Q", "tB7hWb9gTuQ"], "start_seconds": ["60", "30"], "properties": ["applause, audience, yells", "water, flow, trickle"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a person snoring", "a duck quacks continuously"], "sample_ids": ["t8tv5YRMJUg", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["a person, snore, loud", "quacks, continuously, duck"], "captions_pred_video": ["of a man getting his face licked by another man", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a duck is quacking loudly"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "an insect buzzes around continuously"], "sample_ids": ["vuUVPzd2FXw", "v25l1jef3JY"], "start_seconds": ["160", "0"], "properties": ["a, steam, release", "buzzes, continuously, insect"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a fly is buzzing around a microphone "], "question": "which 
entity is a living thing", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "people cheer as a vehicle engine revs"], "sample_ids": ["sapQIQUhFc", "xjhAnI2q6hM"], "start_seconds": ["280", "6"], "properties": ["water, stream, trickles", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wy1eKjR7KC0", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "multiple, people, yell"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "people speak as gunfire rings out"], "sample_ids": ["yDoT73BWsdA", "wqTCwqVRDlk"], "start_seconds": ["10", "80"], "properties": ["engine, revs, vehicle", "gunfire, ring, speak"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["an engine runs and a man speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yT5WfYMRr-U", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["engine, run, man", "airplane, boy, fly"], 
"captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a plane flying by?", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a clock ticktocks"], "sample_ids": ["uPDn2BFTHk", "v-g-j2uTByM"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an engine runs and a man speaks", "wind blowing followed by a zoom"], "sample_ids": ["yT5WfYMRr-U", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["engine, run, man", "wind, blow, zoom"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "an airplane engine runs"], "sample_ids": ["tjmoSi330GM", "yVPZ2MNWpms"], "start_seconds": ["23", "0"], "properties": ["speed, water, boat", "engine, airplane, runs"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a car is driving by on the road "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a woman and man are speaking", "a child speaks in closed space"], "sample_ids": 
["vbpKkWvfOu4", "yW6FWLSLkx4"], "start_seconds": ["560", "40"], "properties": ["two people, speaking, woman, man", "child, space, speak"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a single person", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "a telephone rings followed by a woman talking"], "sample_ids": ["wAAkbZToh8", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["burp, laugh, speak", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man burps and a woman speaks", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["continuous sneezing together with speech", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["x4dZyf9Gbj0", "vbZ-0lGPneg"], "start_seconds": ["130", "30"], "properties": ["continuous, sneeze, speech", "a woman, a television program, a bird"], "captions_pred_video": ["footage is blurry and out of focus", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["a jet engine spools up and takes off", "an airplane engine spools and people speak"], "sample_ids": ["vBslzh7saPw", "wTjoRj1se3U"], "start_seconds": ["90", "390"], "properties": ["engine, spools, takes", "airplane, engine, spool"], "captions_pred_video": ["a pickup truck 
carrying a large object down the road", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a jet engine is running and people are talking"], "question": "which entity is a video of an airplane engine spooling?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sQwlkXjQabo", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["liquid, surface, spray", "music, gunfire, explosion"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["spraying followed by silence", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["water pours followed by a woman speaking and then a man speaking", "small dogs yip and bark sharply"], "sample_ids": ["uRExseg-0XI", "v-wcQf4BDY0"], "start_seconds": ["210", "120"], "properties": ["audio, woman, man", "bark, yip, sharply"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a stream of water runs briefly"], "sample_ids": ["xZepNM9qcRA", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["background, motor, run", "stream, water, run"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man speaks while a motorcycle revs and 
accelerates ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a dark barks and whimpers", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sYj4hpDUZDQ", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["barks, whimpers, dark", "loud, multiple, distance"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["w34HjHr6gAY", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["beeps, hit, woman", "a, chirps, bird"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking and bees are buzzing"], "question": "which entity has a bird chirp?", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "water is sprayed across a hard surface"], "sample_ids": ["xOZfdgAgJ9o", "sQwlkXjQabo"], "start_seconds": ["40", "10"], "properties": ["woman, whimpering, speaking", "water, spray, surface"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "a close-up of a red car with 
water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "spraying followed by silence"], "question": "which entity is a video of a woman speaking and another woman whimpering?", "label": 0}, {"captions": ["a train horn sounds as a railroad passing bell rings", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zgUgkpk78xU", "tdWhHV3X25Q"], "start_seconds": ["70", "60"], "properties": ["horn, bell, train", "applause, audience, yells"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking and a crowd is clapping"], "question": "which entity is a human activity", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wqZ135Ssz0", "ukg5L09Wpvo"], "start_seconds": ["60", "150"], "properties": ["man, woman, squawks", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["a child speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yW6FWLSLkx4", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["a, child, speaks", "airplane, boy, fly"], 
"captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a child speaking?", "label": 0}, {"captions": ["someone is burping continuously", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y636gklDioE", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["burps, burps, burps", "a woman, laughs, animal"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person burps loudly several times", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a prank", "label": 1}, {"captions": ["a weapon fires multiple times", "a woman speaks as she rubs two objects together"], "sample_ids": ["sMC07Ucy7kg", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["weapon, fire, multiple", "two objects, woman, speak"], "captions_pred_video": ["footage is from a car's point of view", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": ["sZvwOuuPGP0", "s4Uz1Ffgo04"], "start_seconds": ["50", "100"], "properties": ["engine, diesel, truck", "roars, background, people speaking"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["someone snores nearby", "a car speeding up in the distance"], "sample_ids": ["spJCm8tD9Zo", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "distance, car, speed"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 1}, {"captions": ["a toilet flushes and water drains", "an infant crying frantically"], "sample_ids": ["sfAvvZwdLCY", "zwOBqeFTgiU"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "cry, infant, frantically"], "captions_pred_video": ["footage of the toilet in the bathroom", "of the baby crying in the car seat"], "captions_pred_audio": ["a toilet is flushed", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a dog barks and whimpers", "soft 
movement is accompanied by clocks ticking in the background"], "sample_ids": ["sShpyu2l4YQ", "vlJS7LN2XyM"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "background, clocks, ticking"], "captions_pred_video": ["the puppies are playing with a toy", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a dog is barking and growling", "a ticktock of a clock"], "question": "which entity is more calm", "label": 1}, {"captions": ["frogs croak and vocalize", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yswmmRZFItk", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["croak, vocalize, frog", "water, radio, man"], "captions_pred_video": ["a close up of a frog in the water", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a frog is croaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a frog?", "label": 1}, {"captions": ["children cry and people talk", "a car accelerates and wind blows"], "sample_ids": ["xLwHe825Zs", "u0TrcHhkPQ"], "start_seconds": ["18", "20"], "properties": ["people talk, children cry, people talk", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby cries and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "the rumbling of a bus followed by a soft male voice"], "sample_ids": ["se87d6yxEOA", "vK93VuO0yNc"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "male voice, bus, rumble"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage is blurry due to the movement of the bus as it drives through the 
city at night"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a car drives by with wind noise in the background "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "people cheer as a vehicle engine revs"], "sample_ids": ["tZGN5a7ybxo", "xjhAnI2q6hM"], "start_seconds": ["60", "6"], "properties": ["ring, train, horn", "engine revs, vehicle, people"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people speak and tapping occurs", "people cheer as a vehicle engine revs"], "sample_ids": ["tFCUUGdREgA", "xjhAnI2q6hM"], "start_seconds": ["70", "6"], "properties": ["people, tap, speak", "engine revs, vehicle, people"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["some men converse over an engine running", "a woman and man are speaking"], "sample_ids": ["sCiy7QS1U", "vbpKkWvfOu4"], "start_seconds": ["300", "560"], "properties": ["men, converse, engine", "two people, speaking, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman 
is speaking and a man is speaking"], "question": "which entity shows two people speaking?", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yNtRmrn0io8", "uZesmtKZGSw"], "start_seconds": ["210", "250"], "properties": ["storm, distance, strike", "men, talk, cars"], "captions_pred_video": ["footage of a house in the middle of the night", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["rain falls and thunder roars", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more likely to be a violent event", "label": 0}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vlJS7LN2XyM", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["background, clocks, ticking", "water, radio, man"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a music is played followed by a frog 
croaking and then music is played again", "people applaud and hoot and chat quietly"], "sample_ids": ["voJh2gJxXhA", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["music, frog, croak", "people, applaud, hoot"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", null], "captions_pred_audio": ["music is playing and crickets are chirping ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an airplane engine spools and people speak", "some men converse over an engine running"], "sample_ids": ["wTjoRj1se3U", "sCiy7QS1U"], "start_seconds": ["390", "300"], "properties": ["airplane, engine, spool", "men, converse, engine"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man conversing over an engine running?", "label": 1}, {"captions": ["birds fly and flutter around", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wGKgwOP3h30", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["fly, flutter, around", "multiple, people, yell"], "captions_pred_video": ["of the pigeons in the coop", null], "captions_pred_audio": ["pigeons coo and flap their wings", "a crowd of people are talking and laughing"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water drains", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sfAvvZwdLCY", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["water drains, flushes, water", "three men, wind, flow"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking 
while the wind is blowing and water is splashing"], "question": "which entity is about water flowing", "label": 1}, {"captions": ["a infant makes noise and is excited", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wIJK3-5y0kA", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["noise, excited, infant", "rustling, ducks, quack"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man is filing a hard object", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vveS8HT7Uog", "sSMl2vc3ek"], "start_seconds": ["100", "20"], "properties": ["a man, hard, object", "loud, multiple, distance"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["water rushes by", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["x-PeY8Yb8M4", "zFjIWfSD-4"], "start_seconds": ["300", "410"], "properties": ["water, rushes, by", "People, motor, brakes"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks as crickets sing", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["ryFDPxgDOGc", "uYT5gxnyMWM"], "start_seconds": ["570", "50"], "properties": ["a, crickets, sing", "a, scream, girl"], "captions_pred_video": ["a 
group of people dressed in camouflage and hunting gear in the dark", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sd7xVssqlw", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["accelerates, tires, squealing", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a steam engine runs and whistles as it passes by", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["se87d6yxEOA", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["run, whistle, pass", "loud, jet engine, roar"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a drill drills through something then people begin laughing"], "sample_ids": ["zALy31PjDl0", "tEE3MpBt1sg"], "start_seconds": ["21", "50"], "properties": ["a man, a vehicle, a horn", "drill, something, laugh"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "people are laughing breathing and speaking with 
background noise "], "question": "which entity is about a drill?", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a child speaks in closed space"], "sample_ids": ["vlS6YMeWAPo", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["noise, bleat, call", "child, space, speak"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a goat bleats and birds chirp", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["some tunes played by whistling", "some tunes played by whistling"], "sample_ids": ["u6BnG6YZqJ4", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["tune, play, whistling", "tune, play, whistling"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person whistling a song", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vr8ZXjEBhMQ", "uYT5gxnyMWM"], "start_seconds": ["150", "50"], "properties": ["wind, blow, zoom", "female, spraying, scream"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "frogs croak and vocalize"], "sample_ids": ["y4tPJXBKDig", "yswmmRZFItk"], "start_seconds": ["20", "0"], "properties": ["a, noise, talk", "croak, vocalize, frog"], "captions_pred_video": ["footage 
of the woman wiping her nose with a tissue", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "water flows and trickles"], "sample_ids": ["zliInBdC98Y", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["a, baby, cries, wails", "water, flow, trickle"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a baby cries and a woman speaks", "water is splashing and gurgling"], "question": "which entity is silent", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a woman speaks as she rubs two objects together"], "sample_ids": ["uWPRNLnpy7Y", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["accelerate, laugh, vehicle", "two objects, woman, speak"], "captions_pred_video": ["is taken from a car driving down the street", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make 
a paper cup with scissors youtube how to"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["a man speaking with light rustling", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["zOZleIRqZm4", "s7knHCFW82w"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "blow horn, get close, train"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vZAw4apG0Es", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["background, clock, ticktocks", "rooster, crow, background, men"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a clock in the background?", "label": 0}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "people speak as gunfire rings out"], "sample_ids": ["sjlVMgdGSK0", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["accelerates, vehicle, race car", "gunfire, ring, speak"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "of a woman shooting a gun at a target on the beach"], 
"captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a telephone rings followed by a woman talking"], "sample_ids": ["vSeGhaZt-aI", "tGcFnX0GHI"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, run", "ring, talk, woman"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a baby laugh at a sputter", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sLUnaPT5gM8", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["laugh, sputter, baby", "male, duck, laugh"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking and ducks are quacking"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wRBHTgrbiwg", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["bird, owl, speak", "engine, laugh, loud"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a car accelerates and wind blows", "winds blows roughly as a vehicle races past"], "sample_ids": ["u0TrcHhkPQ", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["accelerates, wind, blows", "wind, blows, vehicle"], 
"captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vZAw4apG0Es", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["people, clock, converse", "People, motor, brakes"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["a jet engine spools up and takes off", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vBslzh7saPw", "ukg5L09Wpvo"], "start_seconds": ["90", "150"], "properties": ["engine, spools, takes", "clickety-clack, train, whistle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaks as a machine runs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vD6lYD1l0BY", "vfYTJq7nU"], "start_seconds": ["330", "130"], "properties": ["a, machine, run", "rustling, ducks, quack"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a duck quacks and a woman speaks"], "question": "which entity is about a 
machine running?", "label": 0}, {"captions": ["a person is snoring while sleeping", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vJrjSeP17yE", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "multiple, people, yell"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a diesel truck engine runs continuously", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sZvwOuuPGP0", "tdWhHV3X25Q"], "start_seconds": ["50", "60"], "properties": ["engine, diesel, truck", "applause, audience, yells"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking and a crowd is clapping"], "question": "which is not a vehicle", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "an infant crying as a woman laughs"], "sample_ids": ["siJFXfGWgDk", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["man, woman, vehicle", "a, laugh, infant"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a baby cries and a woman speaks"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a girl talking, laughing and sneezing noise", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y4tPJXBKDig", "w5W5Kqtc8E"], "start_seconds": ["20", "100"], "properties": ["a, noise, talk", "wind, blow, vehicle"], "captions_pred_video": ["footage of the 
woman wiping her nose with a tissue", null], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "water is sprayed across a hard surface"], "sample_ids": ["zcDwZ6W7E3E", "sQwlkXjQabo"], "start_seconds": ["180", "10"], "properties": ["man, speak, motorcycles", "water, spray, surface"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "spraying followed by silence"], "question": "which is a liquid", "label": 1}, {"captions": ["a man talks as several small engines run", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["u9A6VZQCZpU", "xyL9F5VrjkE"], "start_seconds": ["30", "20"], "properties": ["a, man, talk", "wind, motor, distance"], "captions_pred_video": [null, "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "the wind is blowing and a car is passing by "], "question": "which entity is about a motor?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["x5cuQjOdM3E", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["cat, meows, young woman", "people, applaud, hoot"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a stream of water runs briefly", "a man speaks as water trickles down a stream"], 
"sample_ids": ["x-PeY8Yb8M4", "sapQIQUhFc"], "start_seconds": ["300", "280"], "properties": ["stream, water, run", "water, stream, trickles"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking and a stream is flowing in the background "], "question": "which stream is running", "label": 0}, {"captions": ["some people speak", "several insects fly while two men talk"], "sample_ids": ["vbZ-0lGPneg", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "several, fly, men"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a woman and man speak while food is frying", "people cheer as a vehicle engine revs"], "sample_ids": ["zk-xJGQU8-4", "xjhAnI2q6hM"], "start_seconds": ["130", "6"], "properties": ["food, man, woman", "engine revs, vehicle, people"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "people speak as gunfire rings out"], "sample_ids": ["tDVADusiIoc", "wqTCwqVRDlk"], "start_seconds": ["60", "80"], "properties": ["water, radio, man", "gunfire, ring, speak"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a woman shooting a gun 
at a target on the beach"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war zone", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vJ7JPEFhyLA", "xfaoyyzw2WU"], "start_seconds": ["16", "180"], "properties": ["three men, wind, flow", "loud, jet engine, roar"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a male speaks and another male speaks", "water splashes as an animal walks through"], "sample_ids": ["viuTg1M-dqg", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["two males, speaking, male", "animal, water, splashes"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "some tunes played by whistling"], "sample_ids": ["yI-KvObbDoY", "u6BnG6YZqJ4"], "start_seconds": ["260", "0"], "properties": ["sound, smack, wind", "tune, play, whistling"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a person whistling a song"], "question": "which entity is a musical 
instrument", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["vBslzh7saPw", "tDlysoZiA1I"], "start_seconds": ["90", "0"], "properties": ["engine, roar, louder", "animal, grunts, chirps"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a jet engine roars and accelerates ", "birds are chirping and a rooster is crowing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "water pouring and bubbling"], "sample_ids": ["tDVADusiIoc", "uyRfq-jKPpo"], "start_seconds": ["60", "50"], "properties": ["wind, radio, waves", "water, bubbles, pouring"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a small airplane approaches and then flies by, after and during which a 
boy speaks"], "sample_ids": ["zNRChLjqcU", "zj2R0XoFr5k"], "start_seconds": ["220", "50"], "properties": ["water, faucet, run", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a helicopter engine runs continuously", "wind blows as people chatter quietly"], "sample_ids": ["ugHJF0hfYkg", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "wind, chatter, people"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry and out of focus"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a vehicle accelerates and squeals tires"], "sample_ids": ["xM4joTqDVp4", "yRx9txMcBl0"], "start_seconds": ["160", "40"], "properties": ["background, chirp, birds", "accelerates, tires, squeals"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["birds are chirping and a 
train is moving ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "paper is crumpling consistently"], "sample_ids": ["x9JovgqUcs", "v5cSxLaHADY"], "start_seconds": ["500", "0"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man speaks and types on a keyboard", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tMJne1a4AFI", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["wind, buzz, rustling", "men, talk, cars"], "captions_pred_video": ["a swarm of bees on the ground", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "an adult male speaks and dials a rotary phone"], "sample_ids": ["y2bVZ7rz-5M", "tK4VlLsNxak"], "start_seconds": ["280", "120"], "properties": ["engine, horn, siren", "An adult male 
speaks, dials, and speaks into a rotary phone"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and using a sewing machine"], "question": "which entity is a vehicle", "label": 0}, {"captions": ["male speech with light ticking", "a child speaks in closed space"], "sample_ids": ["xO-Q2BlIIPU", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["male, speech, ticking", "child, space, speak"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a drill runs and two people laugh", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tEE3MpBt1sg", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["two people, laugh, drill", "a, scream, girl"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has more people", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a loud engine muffles a man as he speaks"], "sample_ids": ["wjsXBsc7M40", "xyx6eNVEYRY"], "start_seconds": ["10", "380"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "loud, engine, muffles"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "footage of a helicopter landing on a runway at an 
airport"], "captions_pred_audio": ["a baby laughs and a woman speaks", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vddP56-ogds", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["water, splash, person, laugh", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be in a storm", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a vehicle accelerates and squeals tires"], "sample_ids": ["vh30P49Po6s", "yRx9txMcBl0"], "start_seconds": ["30", "40"], "properties": ["loud, continuous, quacks", "accelerates, tires, squeals"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a duck is quacking loudly", "a car is revving its engine and skidding "], "question": "which entity is louder", "label": 0}, {"captions": ["someone whistles a tune", "a train engine runs and a horn blows"], "sample_ids": ["sIXTftIuUgw", "zPX9o1uDiI"], "start_seconds": ["90", "40"], "properties": 
["someone, tune, whistle", "engine, horn, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is a train", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["wvKpEYswXO0", "vXlk0lIQBFo"], "start_seconds": ["150", "470"], "properties": ["water, tap, run", "wind, speak, vocalize"], "captions_pred_video": ["of the person preparing food in the kitchen", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is about water?", "label": 0}, {"captions": ["wind blows and women speak as livestock vocalizes", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vXlk0lIQBFo", "uEU-Hg5MTN8"], "start_seconds": ["470", "27"], "properties": ["wind, speak, vocalize", "a woman, laughs, animal"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking and an animal snorting?", "label": 1}, {"captions": ["a man speaking with light rustling", "winds blows roughly as a vehicle races past"], "sample_ids": ["zOZleIRqZm4", "xjvTpk2Zpr8"], "start_seconds": ["80", "70"], "properties": ["light, rustling, man", "wind, blows, vehicle"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a jet engine roars and wind blows "], 
"question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a dog barks and whimpers", "a child speaks in closed space"], "sample_ids": ["sShpyu2l4YQ", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["barks, whimpers, dog", "child, space, speak"], "captions_pred_video": ["the puppies are playing with a toy", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["distant humming of an engine", "a infant makes noise and is excited"], "sample_ids": ["yVPZ2MNWpms", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["sound, distance, engine", "noise, excited, infant"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a car is driving by on the road ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a propeller moves loudly nearby", "some clanking with distant murmuring"], "sample_ids": ["ugHJF0hfYkg", "uMTTDZ2mb4"], "start_seconds": ["10", "30"], "properties": ["loud, propeller, move", "clanking, murmuring, distant"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "people are talking and a car is driving by with wind noise in the background "], "question": "which is quieter", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "someone whistles a tune"], "sample_ids": ["siJFXfGWgDk", "sIXTftIuUgw"], "start_seconds": ["50", "90"], "properties": ["man, woman, vehicle", "someone, tune, whistle"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is 
speaking and birds are chirping in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a baby cries and a woman moans", "a woman speaks as she rubs two objects together"], "sample_ids": ["smDKStoHBJo", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["a, cry, woman", "two objects, woman, speak"], "captions_pred_video": ["a man holding a crying baby in his arms", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a woman speaking?", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a man speaks as a motor runs in the background"], "sample_ids": ["w5W5Kqtc8E", "xZepNM9qcRA"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man speaks 
while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "someone is typing on a computer keyboard"], "sample_ids": ["smDKStoHBJo", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["a, talk, baby, cry", "keyboard, type, computer"], "captions_pred_video": ["a man holding a crying baby in his arms", "how to make money on youtube in spanish"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["v0x1odnXtP0", "zj2R0XoFr5k"], "start_seconds": ["210", "50"], "properties": ["keyboard, type, computer", "airplane, boy, fly"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "wind blows as people chatter quietly"], "sample_ids": ["wyllXV6PjKo", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a kid, talk, cry", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wyllXV6PjKo", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a kid, talk, cry", "a, scream, girl"], 
"captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a baby is crying"], "question": "which entity has a kid?", "label": 0}, {"captions": ["waves crash against a shoreline and people speak", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yFB25fqfU8I", "vbZ-0lGPneg"], "start_seconds": ["300", "30"], "properties": ["wave, crash, shoreline", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a person surfing in the ocean", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a woman is speaking and a dog is whimpering"], "question": "which entity is a video of a television program?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["slZLHwNbbt4", "tDVADusiIoc"], "start_seconds": ["300", "60"], "properties": ["train, horn, sound", "water, radio, man"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "birds chirp and objects are moved around"], "sample_ids": ["zgUgkpk78xU", "yPUYU6t3rwo"], "start_seconds": ["70", "370"], "properties": ["horn, bells, ring", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 
1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a duck quacks continuously"], "sample_ids": ["v7jJS8aAyA", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["wind, blows, loudly", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a toilet flushes and water drains"], "sample_ids": ["tDlfY3nmx1A", "sfAvvZwdLCY"], "start_seconds": ["160", "20"], "properties": ["applause, laugh, man", "water drains, flushes, water"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a toilet is flushed"], "question": "which entity is a draining water?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "wind blows as people chatter quietly"], "sample_ids": ["sQwlkXjQabo", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "wind, chatter, people"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage is blurry and out of focus"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a male speaks 
and another male speaks", "someone is burping continuously"], "sample_ids": ["viuTg1M-dqg", "y636gklDioE"], "start_seconds": ["30", "20"], "properties": ["two males, speaking, male", "burps, burps, burps"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person burps loudly several times"], "question": "which entity is more likely to be a prank", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "small dogs yip and bark sharply"], "sample_ids": ["xzKKf9bKNUo", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["background, noise, snoring", "bark, yip, sharply"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a person snoring loudly", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["an insect buzzes around continuously", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["v25l1jef3JY", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "a woman, a television program, a bird"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and a dog is whimpering"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a door slams shut and an object moves on a hard surface", "water pouring and bubbling"], "sample_ids": ["zkKdxzNC97Y", "uyRfq-jKPpo"], "start_seconds": ["27", "50"], "properties": ["hard, surface, door", "water, bubbles, pouring"], "captions_pred_video": 
["footage of the door opening and closing in slow motion", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a door is opened and closed", "water is running from a faucet"], "question": "which entity is more likely to be a liquid", "label": 1}, {"captions": ["some clanking with distant murmuring", "water flows as men speak and yell"], "sample_ids": ["uMTTDZ2mb4", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["clanking, murmuring, distant", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["soTOh3zYJfY", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["vehicle, skid, tires", "men, talk, cars"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a infant makes noise and is excited", "some men converse over an engine running"], "sample_ids": ["wIJK3-5y0kA", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["noise, excited, infant", "men, converse, engine"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more quiet", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "some men converse over an engine running"], "sample_ids": ["ukxt9I7eMMg", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["food, pan, cook", "men, converse, engine"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a conversation?", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a car accelerates and wind blows"], "sample_ids": ["xM4joTqDVp4", "u0TrcHhkPQ"], "start_seconds": ["160", "20"], "properties": ["background, chirp, birds", "accelerates, wind, blows"], "captions_pred_video": ["footage is of a train station with a train on the tracks and 
smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "an engine runs loudly"], "sample_ids": ["t69a8aRKhmc", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["a, b, c", "loud, engine, run"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["an adult male speaks and dials a rotary phone", "some men converse over an engine running"], "sample_ids": ["tK4VlLsNxak", "sCiy7QS1U"], "start_seconds": ["120", "300"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "men, converse, engine"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation between two men?", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "an airplane engine roars increasingly louder"], "sample_ids": ["vKrYfzleLB8", "vBslzh7saPw"], "start_seconds": ["110", "90"], "properties": ["a, ring, gunshots", "engine, roar, louder"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a jet engine roars and accelerates "], "question": "which is louder", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "music plays and 
someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w1mlz3Pe4fU", "xKB8O8LTs6s"], "start_seconds": ["300", "70"], "properties": ["vocalize, chirp, continuously", "music, gunfire, explosion"], "captions_pred_video": ["of a bird in a cage", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds are chirping and singing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["children speak as a female ask them questions", "several insects fly while two men talk"], "sample_ids": ["wEBlkGWVWwE", "s-T9OVOiMLo"], "start_seconds": ["260", "330"], "properties": ["female, speak, questions", "several, fly, men"], "captions_pred_video": ["shows a person writing on the whiteboard", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "people speak as gunfire rings out"], "sample_ids": ["ujMt0-D-x2k", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["snoring, rhythmical, nearby", "gunfire, ring, speak"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["ukxt9I7eMMg", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["food, pan, cook", "a woman, laughs, animal"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a 
girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking and laughing?", "label": 1}, {"captions": ["birds fly and flutter around", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["wGKgwOP3h30", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["fly, flutter, around", "sheep, baa, birds"], "captions_pred_video": ["of the pigeons in the coop", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["pigeons coo and flap their wings", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "some men converse over an engine running"], "sample_ids": ["ylpYOorfH4o", "sCiy7QS1U"], "start_seconds": ["410", "300"], "properties": ["motor, run, steady", "men, converse, engine"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man speaks while a motorcycle revs and accelerates "], "question": "which is a more active scene", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a small airplane approaches and then flies by, after 
and during which a boy speaks"], "sample_ids": ["vzxHnu-SFEw", "zj2R0XoFr5k"], "start_seconds": ["80", "50"], "properties": ["two objects, woman, speak", "airplane, boy, fly"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "vehicles pass by on a roadway"], "sample_ids": ["yYEVLuqEytU", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["grunt, slurp, background", "pass, vehicle, roadway"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a fire truck entering a garage"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the 
background", "an infant crying as a woman laughs"], "sample_ids": ["ziUT9IFTkjg", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["background, birds, rustling", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person sniffles and sneezes", "a man speaks in the background while a slow tick repeats"], "sample_ids": ["uRlbY6aoBU", "vZAw4apG0Es"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "background, tick, repeat"], "captions_pred_video": [null, "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a man is sneezing ", "a clock is ticking and people are talking"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a man speaks then blows a vehicle horn as wind blows"], "sample_ids": ["wz7N8YRy74I", "zALy31PjDl0"], "start_seconds": ["30", "21"], "properties": ["rooster, crow, background, men", "a man, a vehicle, a horn"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a motorcycle is parked on the side of a brick walkway"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and a car horn is honking"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a man speaks as a car is passing by"], "sample_ids": ["tDlysoZiA1I", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["animal, grunts, chirps", "a, car, pass"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a car speeding up in the distance"], "sample_ids": ["vdoxuJn9lTc", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["burp, loud, girl", "distance, car, speed"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["water flows as men speak and yell", "paper folding and crinkling"], "sample_ids": ["vJ7JPEFhyLA", "zPpG3RD8lSs"], "start_seconds": ["16", "20"], "properties": ["water, flow, men", "paper, fold, crinkle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card 
out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "the wind blows and a mouse clicks "], "question": "which entity is more still", "label": 1}, {"captions": ["a toilet flushes and water drains", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sfAvvZwdLCY", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["water drains, flushes, water", "rustling, ducks, quack"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["a airplane flies overhead as a woman speaks", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zj2R0XoFr5k", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, woman", "gun, shoot, water"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a child speaks", "a machine beeps continuously"], "sample_ids": ["yW6FWLSLkx4", "y682ml90jGw"], "start_seconds": ["40", "11"], "properties": ["a, child, speaks", "beeps, machine, continuously"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a beeping sound is being 
made "], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uiS58TNyUiw", "vb1fPSDI4c"], "start_seconds": ["430", "30"], "properties": ["audio, man, speaking", "multiple, people, yell"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wqADXCzngMw", "vbZ-0lGPneg"], "start_seconds": ["340", "30"], "properties": ["engine, idle, man", "a woman, a television program, a bird"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a man talking?", "label": 0}, {"captions": ["race cars go around a track as a man commentates", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["uZesmtKZGSw", "xV7Mg1QucSc"], "start_seconds": ["250", "14"], "properties": ["car, track, man", "alarm, ticktocks, laughs"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "an alarm clock ticks and a woman laughs"], "question": "which entity has a man laugh?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "people speak as gunfire rings out"], "sample_ids": ["uZesmtKZGSw", "wqTCwqVRDlk"], "start_seconds": ["250", "80"], "properties": ["men, talk, cars", "gunfire, ring, speak"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zuua6-5goWw", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["birds, chirp, quiet, man, speaks", "harsh, wind, blows"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "on how to use a sewing machine youtube"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 0}, {"captions": ["people speak and tapping occurs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tFCUUGdREgA", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["people, tap, speak", "female, spraying, scream"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and tapping?", "label": 0}, {"captions": ["a beep occurs briefly", "water pouring and bubbling"], "sample_ids": ["xtWeJ56-U-g", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["beep, occur, briefly", "water, bubbles, pouring"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "water is running from a faucet"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a man is filing a hard object", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vveS8HT7Uog", "vYkA3cfXp5Q"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "engine, accelerate, idle"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "an engine is idling"], "question": "which object is more likely to be a vehicle engine?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wRBHTgrbiwg", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["bird, owl, speak", "a woman, laughs, animal"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking briefly?", "label": 0}, {"captions": ["water splashes and a door squeaks", "someone is typing on a computer keyboard"], "sample_ids": ["sdXV-ylviw", "v0x1odnXtP0"], "start_seconds": ["190", "210"], "properties": ["sound, splash, door", 
"keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "pigeons vocalize and birds chirp"], "sample_ids": ["vs65y4qmyBE", "uiS58TNyUiw"], "start_seconds": ["340", "430"], "properties": ["wind, blows, strongly", "vocalize, bird, chirp"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of the pigeon in the cage"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["tDVADusiIoc", "sLUnaPT5gM8"], "start_seconds": ["60", "0"], "properties": ["wind, radio, waves", "loud, laughter, intermittent"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zNRChLjqcU", "uEU-Hg5MTN8"], "start_seconds": ["220", "27"], "properties": ["water, faucet, run", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["music 
plays and repeated slaps accompany human sniveling, then insect buzz", "three men talk while wind blows and some liquid flows"], "sample_ids": ["weDbePuc-Xc", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["music, slaps, human", "three men, wind, flow"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more liquid flowing", "label": 1}, {"captions": ["a dog barks and whimpers", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sShpyu2l4YQ", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["barks, whimpers, dog", "airplane, boy, fly"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a dog is barking and growling", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "an insect buzzes around continuously"], "sample_ids": ["uJV8NDaHqqk", "v25l1jef3JY"], "start_seconds": ["100", "0"], "properties": ["loud, fly, chirp", "buzzes, continuously, insect"], "captions_pred_video": ["a bee hive in a wooden box", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a swarm of bees buzzing around", "a fly is buzzing around a microphone "], "question": "which fly buzzes around loudly", "label": 0}, {"captions": ["paper folding and crinkling", "a man is snoring loudly and repeatedly"], "sample_ids": ["zPpG3RD8lSs", "sncRqQ67iJU"], "start_seconds": ["20", "460"], "properties": ["paper, fold, crinkle", "loud, repeatedly, 
man"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zcDwZ6W7E3E", "y8WEcpOlT3I"], "start_seconds": ["180", "40"], "properties": ["a, man, speak", "harsh, wind, blows"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking with wind noise in the background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a train horn blows as it passes by", "an engine runs loudly"], "sample_ids": ["zVacuqSb4LI", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["horn, blows, train", "loud, engine, run"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a lawn mower is running and men are speaking "], "question": "which train is louder", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "continuous chugging with birds chirping in the background"], "sample_ids": ["sWZzXuWYY", "xM4joTqDVp4"], "start_seconds": ["420", "160"], "properties": ["male, clanks, thumps", "background, chirp, birds"], "captions_pred_video": [null, "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "birds are chirping and a train is moving "], "question": "which entity has a male speaking?", "label": 0}, {"captions": ["a toilet flushes and water sputters as it drains", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["smGI3C1NZc", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["water, drain, toilet", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a toilet is flushed", "a woman speaks while a helicopter flies 
overhead "], "question": "which entity is a moving object", "label": 1}, {"captions": ["motors runs briefly and tires screech", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yRx9txMcBl0", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["motors, tires, screech", "People, motor, brakes"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running and air brakes hissing?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a clock ticktocks"], "sample_ids": ["wRBHTgrbiwg", "v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["bird, owl, speak", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a propeller rotates loudly and intensely"], "sample_ids": ["uYT5gxnyMWM", "ugHJF0hfYkg"], "start_seconds": ["50", "10"], "properties": ["female, spraying, scream", "loud, intense, propeller"], "captions_pred_video": ["footage of a 
person spraying paint on the ceiling", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xSKJGCItUWE", "zl9Dqx-j7q4"], "start_seconds": ["10", "6"], "properties": ["engine, work, child", "engine, laugh, loud"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tOj4tdLRaA", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["woman, laugh, baby", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a person speaking and a baby laughing?", "label": 0}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a car speeding up in the distance"], "sample_ids": ["w8uLijTqtlU", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["wind, microphone, noise", "distance, car, speed"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the 
background", "a woman speaks as she rubs two objects together"], "sample_ids": ["yYEVLuqEytU", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["animal, pig, background", "two objects, woman, speak"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a child speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["yW6FWLSLkx4", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": ["a, child, speaks", "loud, laughter, intermittent"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, 
{"captions": ["multiple ducks quack continuously", "a stream of water runs briefly"], "sample_ids": ["wfHeoPDLMaM", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["multiple, quack, continuously", "stream, water, run"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["ducks are quacking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sd7xVssqlw", "yDoT73BWsdA"], "start_seconds": ["50", "10"], "properties": ["accelerates, tires, squealing", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["a person whistles a meandering tune", "a stream of water runs briefly"], "sample_ids": ["uFoga8sHpiw", "x-PeY8Yb8M4"], "start_seconds": ["90", "300"], "properties": 
["person, tune, whistle", "stream, water, run"], "captions_pred_video": ["footage of a bird in a cage", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person whistles a song", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uRlbY6aoBU", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["a, distance, sneeze", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and ducks are quacking"], "question": "which entity is a person?", "label": 0}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a stream of water runs briefly"], "sample_ids": ["wnpJndXuxLc", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["beeps, loud, whistle", "stream, water, run"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "a stream of water runs briefly"], "sample_ids": ["vms5XGTDVQc", "x-PeY8Yb8M4"], "start_seconds": ["220", "300"], "properties": ["paper, crumpled, crinkled", "stream, water, run"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["paper is crumpled and crinkled", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "frogs croak and vocalize"], "sample_ids": ["vmrxwuAMb2I", 
"yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["a dog, inhales, exhales", "croak, vocalize, frog"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a close up of a frog in the water"], "captions_pred_audio": ["a dog barks and growls", "a frog is croaking"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a infant makes noise and is excited"], "sample_ids": ["smDKStoHBJo", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["a, infant, speaking", "noise, excited, infant"], "captions_pred_video": ["a man holding a crying baby in his arms", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a baby cries and a woman speaks"], "question": "which infant is making noise", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "water flows as men speak and yell"], "sample_ids": ["y2ZBGpgbhHM", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["birds, tweet, pant", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wqN6IIHw3po", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["rain, surface, fall", "engine, idle, woman"], "captions_pred_video": ["in your own words what is happening in this screenshot? 
blood splattered all over the place", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and water is splashing", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a person sniffs and sneezes", "multiple birds chirp and an animal grunts"], "sample_ids": ["uRlbY6aoBU", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["sneezes, person, sniffs", "animal, grunt, multiple"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is sneezing ", "birds are chirping and a rooster is crowing "], "question": "which entity is a person", "label": 0}, {"captions": ["a person snoring", "people applaud and hoot and chat quietly"], "sample_ids": ["t8tv5YRMJUg", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["a person, snore, loud", "people, applaud, hoot"], "captions_pred_video": ["of a man getting his face licked by another man", null], "captions_pred_audio": ["a person sniffs and breathes heavily", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wqADXCzngMw", "uZesmtKZGSw"], "start_seconds": ["340", "250"], "properties": ["engine, idle, man", "men, talk, cars"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a man talking to an engine?", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "wind blowing followed by a zoom"], "sample_ids": ["zF8yoL0rkbI", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["engine, run, someone", "wind, blow, zoom"], "captions_pred_video": ["footage of the traffic on the street at night", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["birds chirp and wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sxIvBMSavMQ", "yajyRTUQk3U"], "start_seconds": ["210", "400"], "properties": ["birds, chirp, wind", "a woman, something, fried"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking 
while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "bees buzz as wind blows"], "sample_ids": ["zsLxS-uLJTw", "tMJne1a4AFI"], "start_seconds": ["20", "0"], "properties": ["horn, blast, train", "bees, buzz, wind"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "a swarm of bees on the ground"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a swarm of bees buzzing around"], "question": "which entity is buzzing", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "vehicles pass by on a roadway"], "sample_ids": ["zgUgkpk78xU", "tgbONvsP47Y"], "start_seconds": ["70", "0"], "properties": ["horn, bells, ring", "pass, vehicle, roadway"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a baby laugh at a sputter", "wind blows as people chatter quietly"], "sample_ids": ["sLUnaPT5gM8", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["laugh, sputter, baby", "wind, chatter, people"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", 
"a airplane flies overhead as a woman speaks"], "sample_ids": ["uWAAAL4CIoc", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["a woman, chirps, animal", "airplane, fly, woman"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is more likely to be flying", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["sapQIQUhFc", "siJFXfGWgDk"], "start_seconds": ["280", "50"], "properties": ["water, stream, trickles", "a, bird, vehicle"], "captions_pred_video": [null, "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking and birds are chirping in the background "], "question": "which entity is about a man speaking as water trickles down a stream?", "label": 0}, {"captions": ["a horn blasts as warning bells ring", "a muffled toilet flushes and the water drains"], "sample_ids": ["zgUgkpk78xU", "sfAvvZwdLCY"], "start_seconds": ["70", "20"], "properties": ["horn, bells, ring", "flushes, drains, water"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a toilet is flushed"], "question": "which entity is silent", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "music plays and someone speaks before gunfire 
and an explosion occurs"], "sample_ids": ["yswmmRZFItk", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["background, frog, croak", "music, gunfire, explosion"], "captions_pred_video": ["a close up of a frog in the water", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a frog is croaking", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["an infant crying frantically", "paper folding and crinkling"], "sample_ids": ["zwOBqeFTgiU", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["cry, infant, frantically", "paper, fold, crinkle"], "captions_pred_video": ["of the baby crying in the car seat", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a baby cries loudly", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a person speaks briefly", "plastic is tapped on while someone speaks"], "sample_ids": ["zOZleIRqZm4", "wvKpEYswXO0"], "start_seconds": ["80", "150"], "properties": ["person, talk, brief", "plastic, tap, speak"], "captions_pred_video": ["a 
person picking berries from the bushes in the garden", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a person talking?", "label": 0}, {"captions": ["a clock ticktocks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["v-g-j2uTByM", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["ticktocks, clock, ticktocks", "wind, blow, vehicle"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", null], "captions_pred_audio": ["a clock is ticking loudly", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a clock?", "label": 1}, {"captions": ["someone whistles a song", "a duck quacks continuously"], "sample_ids": ["sIXTftIuUgw", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["someone, song, whistle", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person whistling a song", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a toilet flushes and a female speaks"], "sample_ids": ["vzceMbklWc", "yaln9y8I7ms"], "start_seconds": ["180", "230"], "properties": ["water, faucet, sink", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["water is running and a man is speaking", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yaln9y8I7ms", "su6FAOcOA8c"], "start_seconds": ["230", "4"], "properties": 
["female, flushes, toilet", "engine, idle, woman"], "captions_pred_video": ["footage is blurry and out of focus", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking while a toilet flushes?", "label": 0}, {"captions": ["an airplane accelerates briefly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zjTG0gaGCUI", "yajyRTUQk3U"], "start_seconds": ["80", "400"], "properties": ["accelerates, airplane, briefly", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a woman is speaking while food is frying in the background"], "question": "which object is fried", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a motorcycle engine is idling"], "sample_ids": ["x9JovgqUcs", "vZAqdHZ81yA"], "start_seconds": ["500", "180"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "engine, motorcycle, idling"], "captions_pred_video": [null, "a motorcycle is parked on the side of the road with its rear end facing the viewer"], "captions_pred_audio": ["a man speaks and types on a keyboard", "an engine is idling loudly"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a telephone rings followed by a woman talking"], "sample_ids": ["vJvryTwuAV8", "tGcFnX0GHI"], "start_seconds": ["16", "0"], "properties": ["audience, cheer, man", "ring, talk, woman"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", null], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a 
conversation", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wIvYjuR3nrg", "zl9Dqx-j7q4"], "start_seconds": ["9", "6"], "properties": ["birds, pigeons, vocalize", "engine, laugh, loud"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and cooing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xfudFO976zE", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["animal, bleats, cry", "two men, woman, birds"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "an infant crying as a woman laughs"], "sample_ids": ["wRV8yMk886E", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["liquid, spray, nozzle", "a, laugh, infant"], "captions_pred_video": ["two cars are parked in a parking lot at night", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["wind blows strongly", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["w8uLijTqtlU", "vXlk0lIQBFo"], "start_seconds": ["70", "470"], "properties": ["wind, blows, strongly", "wind, speak, vocalize"], "captions_pred_video": ["footage is blurry and shaky", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["the wind is blowing strongly", 
"wind chimes are ringing and people are speaking and laughing "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a young woman speaks over spraying and another person yells"], "sample_ids": ["shmR4OZtzqA", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["man, engine, idle", "person, spray, yell"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man speaks while a motor runs", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a person screams glaringly", "dishes cling together then a man begins to speak"], "sample_ids": ["xC8kbrKJmco", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["glaringly, screams, person", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a goat is bleating ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["su6FAOcOA8c", 
"uYT5gxnyMWM"], "start_seconds": ["4", "50"], "properties": ["engine, idle, woman", "female, spraying, scream"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman speaking?", "label": 0}, {"captions": ["a vehicle engine runs and wind blows before women yell", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["w5W5Kqtc8E", "vfYTJq7nU"], "start_seconds": ["100", "130"], "properties": ["wind, blow, vehicle", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a baby cries and a woman moans", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["smDKStoHBJo", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["a, cry, woman", "airplane, boy, fly"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a telephone rings followed by a woman talking"], "sample_ids": ["sDSppXIlJrs", "tGcFnX0GHI"], "start_seconds": ["27", "0"], "properties": ["microphone, water, wind", "ring, talk, woman"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", null], "captions_pred_audio": 
["the wind is blowing and water is splashing", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a recording", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "an aircraft engine runs as people speak"], "sample_ids": ["wqADXCzngMw", "wTjoRj1se3U"], "start_seconds": ["340", "390"], "properties": ["engine, idle, man", "engine, run, people"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a jet engine is running and people are talking"], "question": "which entity has a man talking to it?", "label": 0}, {"captions": ["water pouring and bubbling", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uyRfq-jKPpo", "vfYTJq7nU"], "start_seconds": ["50", "130"], "properties": ["water, bubbles, pouring", "rustling, ducks, quack"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", null], "captions_pred_audio": ["water is running from a faucet", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": 
["a vehicle engine runs and someone speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["zF8yoL0rkbI", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["engine, run, someone", "cling, speak, dishes"], "captions_pred_video": ["footage of the traffic on the street at night", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "mechanisms are operating and water is splashing "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "children cheer as a man speaks then an audience screams"], "sample_ids": ["w1mlz3Pe4fU", "vJvryTwuAV8"], "start_seconds": ["300", "16"], "properties": ["vocalize, chirp, continuously", "audience, cheer, man"], "captions_pred_video": ["of a bird in a cage", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is a group of people", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "running water in a faucet with some clinks"], "sample_ids": ["vqZuVbG6-HI", "zNRChLjqcU"], "start_seconds": ["130", "220"], "properties": ["background, male, female", "water, faucet, run"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "water is running from a faucet into a sink"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a car is passing by", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sK4u5T8hW78", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, car, pass", "three men, wind, flow"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more moving parts", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vSeGhaZt-aI", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["water, sink, talk", "loud, multiple, distance"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a door opens and closes", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vBHyYJ8pL0", "vYkA3cfXp5Q"], "start_seconds": ["2", "30"], "properties": ["open, close, door", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "an engine is idling"], "question": "which entity is a moving object", "label": 1}, {"captions": ["someone snores nearby", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["spJCm8tD9Zo", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["someone snores, nearby, someone", "airplane, boy, fly"], "captions_pred_video": ["of a man 
laying on the ground with his mouth open", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person is snoring loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "water flows as men speak and yell"], "sample_ids": ["y2bVZ7rz-5M", "vJ7JPEFhyLA"], "start_seconds": ["280", "16"], "properties": ["engine, horn, siren", "water, flow, men"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["su6FAOcOA8c", "yLy-WycbVVE"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "background, people, talk"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a door slams shut and an object moves on a hard surface", "a man speaks in the background while a slow tick repeats"], "sample_ids": ["zkKdxzNC97Y", "vZAw4apG0Es"], "start_seconds": ["27", "30"], "properties": ["hard, surface, door", "background, tick, repeat"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a clock made out of wood and gears with birds on top of it"], 
"captions_pred_audio": ["a door is opened and closed", "a clock is ticking and people are talking"], "question": "which entity is more calm", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a man speaks as a car is passing by"], "sample_ids": ["yPUYU6t3rwo", "sK4u5T8hW78"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "a, car, pass"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["insects buzz and a man speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["tDlfY3nmx1A", "ukg5L09Wpvo"], "start_seconds": ["160", "150"], "properties": ["applause, laugh, man", "clickety-clack, train, whistle"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "dishes cling together then a man begins to speak"], "sample_ids": ["uEU-Hg5MTN8", "sQGXqGcwOTc"], "start_seconds": ["27", 
"3"], "properties": ["animal, grunts, snorts", "cling, speak, dishes"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman speaking and laughing and an animal grunts and snorts?", "label": 0}, {"captions": ["a person sniffs and sneezes", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["uRlbY6aoBU", "y8WEcpOlT3I"], "start_seconds": ["0", "40"], "properties": ["sneezes, person, sniffs", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be a person", "label": 0}, {"captions": ["a vehicle engine accelerating then running on idle", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vYkA3cfXp5Q", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["engine, accelerate, idle", "a woman, laughs, animal"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["u7C-AEBQM", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["ticks, rhythmic, quiet", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a ticktock of a clock", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more 
violent", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "water flows as men speak and yell"], "sample_ids": ["vs65y4qmyBE", "vJ7JPEFhyLA"], "start_seconds": ["340", "16"], "properties": ["engine, run, man", "water, flow, men"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["children speak as a female ask them questions", "water is sprayed across a hard surface"], "sample_ids": ["wEBlkGWVWwE", "sQwlkXjQabo"], "start_seconds": ["260", "10"], "properties": ["female, speak, questions", "water, spray, surface"], "captions_pred_video": ["shows a person writing on the whiteboard", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["someone is burping continuously", "a car speeding up in the distance"], "sample_ids": ["y636gklDioE", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["burps, burps, burps", "distance, car, speed"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["several insects fly while two men talk", "some tunes played by whistling"], "sample_ids": ["s-T9OVOiMLo", "u6BnG6YZqJ4"], "start_seconds": ["330", "0"], "properties": ["several, fly, men", "tune, play, whistling"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "a young boy standing 
in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a car accelerates and wind blows"], "sample_ids": ["tDVADusiIoc", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["man, radio, blows", "accelerates, wind, blows"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vfYTJq7nU", "tiDFTC-5vU"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking as others laugh?", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "paper folding and crinkling"], "sample_ids": ["tK4VlLsNxak", "zPpG3RD8lSs"], "start_seconds": ["120", "20"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "paper, fold, crinkle"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece 
of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["bees buzz and wind blows", "a machine engine runs and a man speaks"], "sample_ids": ["tMJne1a4AFI", "vs65y4qmyBE"], "start_seconds": ["0", "340"], "properties": ["bees buzz, wind blows, bees", "engine, run, man"], "captions_pred_video": ["a swarm of bees on the ground", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a swarm of bees buzzing around", "a heavy engine is running and men are speaking "], "question": "which entity is a machine?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xjhAnI2q6hM", "wDVMhEdTiVw"], "start_seconds": ["6", "30"], "properties": ["engine revs, vehicle, people", "gun, shoot, water"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun?", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "some tunes played by whistling"], "sample_ids": ["zuua6-5goWw", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["birds, 
chirp, quiet, man, speaks", "tune, play, whistling"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uEU-Hg5MTN8", "sSMl2vc3ek"], "start_seconds": ["27", "20"], "properties": ["animal, grunts, snorts", "loud, multiple, distance"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "people cheer as a vehicle engine revs"], "sample_ids": ["v7jJS8aAyA", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["wind, blows, loudly", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["birds chirp 
then an animal grunts", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["tDlysoZiA1I", "vlS6YMeWAPo"], "start_seconds": ["0", "40"], "properties": ["animal, grunt, chirp", "sheep, baa, birds"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["a kid speaks followed by music playing", "water splashes as an animal walks through"], "sample_ids": ["tQWGZLItBXk", "w1ir-sZ3Im8"], "start_seconds": ["170", "90"], "properties": ["music, kid, speak", "animal, water, splashes"], "captions_pred_video": ["worms revolution screenshots", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sQGXqGcwOTc", "wqZ135Ssz0"], "start_seconds": ["3", "60"], "properties": ["audio, kid, giggles", "two men, woman, birds"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "a woman speaks as she rubs two objects together"], "sample_ids": ["tdWhHV3X25Q", "vzxHnu-SFEw"], "start_seconds": ["60", "80"], "properties": ["applause, audience, yells", "two objects, woman, speak"], 
"captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["water splashes and a door squeaks", "several insects fly while two men talk"], "sample_ids": ["sdXV-ylviw", "s-T9OVOiMLo"], "start_seconds": ["190", "330"], "properties": ["sound, splash, door", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "an infant crying frantically"], "sample_ids": ["x9JovgqUcs", "zwOBqeFTgiU"], "start_seconds": ["500", "30"], "properties": ["a, man, speaks, keyboard", "cry, infant, frantically"], "captions_pred_video": [null, 
"of the baby crying in the car seat"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["ziUT9IFTkjg", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["background, birds, rustling", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a man speaks while water drains", "a child speaks in closed space"], "sample_ids": ["vSeGhaZt-aI", "yW6FWLSLkx4"], "start_seconds": ["50", "40"], "properties": ["water, drain, man", "child, space, speak"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a stream of water runs briefly", "a vehicle engine accelerating then running on idle"], "sample_ids": ["x-PeY8Yb8M4", "vYkA3cfXp5Q"], "start_seconds": ["300", "30"], "properties": ["stream, water, run", "engine, accelerate, idle"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a car is driving on a wet road ", "an engine is idling"], "question": "which entity is a source of water", "label": 0}, {"captions": ["a toilet flushes and a female 
speaks", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["yaln9y8I7ms", "rqu8iB22IY"], "start_seconds": ["230", "5"], "properties": ["female, flushes, toilet", "sound, repeats, laugh"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a dog barks and a man speaks while music plays "], "question": "which entity has a female speaking?", "label": 0}, {"captions": ["a telephone rings and a bird vocalizes", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["skd2PphS6oI", "y2ZBGpgbhHM"], "start_seconds": ["190", "30"], "properties": ["ring, bird, vocalize", "birds, tweet, pant"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", null], "captions_pred_audio": ["a telephone bell rings repeatedly ", "birds chirping and a dog panting"], "question": "which entity has more birds", "label": 1}, {"captions": ["an insect buzzes around continuously", "a propeller rotates loudly and intensely"], "sample_ids": ["v25l1jef3JY", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["buzzes, continuously, insect", "loud, intense, propeller"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a stream of water runs briefly"], "sample_ids": ["w5W5Kqtc8E", "x-PeY8Yb8M4"], "start_seconds": ["100", "300"], "properties": ["wind, engine, scream", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a car is driving on a wet road "], "question": 
"which entity is a stream of water?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a vehicle engine revs and tires squeal"], "sample_ids": ["zl9Dqx-j7q4", "yDoT73BWsdA"], "start_seconds": ["6", "10"], "properties": ["motors rev, laugh, loudly", "engine revs, tires squeal, vehicle"], "captions_pred_video": ["footage of a man driving a car in the dark", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a jet engine roars ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vJvryTwuAV8", "wz7N8YRy74I"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "rooster, crow, background, men"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "someone is typing on a computer keyboard"], "sample_ids": ["xNMovAf3o50", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["rain, thunder, music", "keyboard, type, computer"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "how to make money on youtube in spanish"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a person is typing on a keyboard"], "question": "which entity is a person", "label": 1}, {"captions": ["a person snoring", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["t8tv5YRMJUg", 
"uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["a person, snore, loud", "female, spraying, scream"], "captions_pred_video": ["of a man getting his face licked by another man", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a woman is speaking and a baby is crying"], "question": "which entity is louder", "label": 1}, {"captions": ["an engine runs loudly", "an airplane engine runs"], "sample_ids": ["vqZuVbG6-HI", "yVPZ2MNWpms"], "start_seconds": ["130", "0"], "properties": ["loud, engine, run", "engine, airplane, runs"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a car is driving by on the road "], "question": "which entity is running", "label": 1}, {"captions": ["someone snores nearby", "a man speaks as a car is passing by"], "sample_ids": ["spJCm8tD9Zo", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a, car, pass"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "water flows as men speak and yell"], "sample_ids": ["xNMovAf3o50", "vJ7JPEFhyLA"], 
"start_seconds": ["0", "16"], "properties": ["rain, thunder, music", "water, flow, men"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "people speak as gunfire rings out"], "sample_ids": ["vzceMbklWc", "wqTCwqVRDlk"], "start_seconds": ["180", "80"], "properties": ["water, faucet, sink", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "vehicles pass by on a roadway"], "sample_ids": ["sapQIQUhFc", "tgbONvsP47Y"], "start_seconds": ["280", "0"], "properties": ["water, stream, trickles", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a infant makes noise and is excited", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wIJK3-5y0kA", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["noise, excited, infant", "engine, accelerate, idle"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a baby cries and a woman speaks", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["dog barking and 
vehicle engine idling followed shortly by vehicle engine revving", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zY3icUyMdh8", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["dog, bark, engine", "People, motor, brakes"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "a telephone rings followed by a woman talking"], "sample_ids": ["x6ijhqRY38s", "tGcFnX0GHI"], "start_seconds": ["250", "0"], "properties": ["something metal, glass, hit", "ring, talk, woman"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a woman speaks with water running", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wTideSjRFS0", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["water, running, woman", "music, gunfire, explosion"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a train horn blows as it passes by"], "sample_ids": ["zofjfKhqLk8", "zVacuqSb4LI"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "horn, 
blows, train"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "water pouring and bubbling"], "sample_ids": ["vSeGhaZt-aI", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["water, bubbles, run", "water, bubbles, pouring"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how 
to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "water is running from a faucet"], "question": "which entity has more water", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a toilet flushes and water drains"], "sample_ids": ["t8CV69hcvF0", "sfAvvZwdLCY"], "start_seconds": ["210", "20"], "properties": ["person, sneeze, follow", "water drains, flushes, water"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman sneezes and speaks", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a infant makes noise and is excited", "birds vocalize and chirp continuously"], "sample_ids": ["wIJK3-5y0kA", "w1mlz3Pe4fU"], "start_seconds": ["30", "300"], "properties": ["noise, excited, infant", "vocalize, chirp, continuously"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of a bird in a cage"], "captions_pred_audio": ["a baby cries and a woman speaks", "birds are chirping and singing"], "question": "which entity is more vocal", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a infant makes noise and is excited"], "sample_ids": ["voJh2gJxXhA", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["music, frog, croak", "noise, excited, infant"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["vehicles pass by on a roadway", "people speak as gunfire rings out"], "sample_ids": ["tgbONvsP47Y", "wqTCwqVRDlk"], 
"start_seconds": ["0", "80"], "properties": ["pass, vehicle, roadway", "gunfire, ring, speak"], "captions_pred_video": ["footage of a fire truck entering a garage", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car is driving on the road ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["a woman sneezes then speaks", "a man speaks in the background while a slow tick repeats"], "sample_ids": ["x4dZyf9Gbj0", "vZAw4apG0Es"], "start_seconds": ["130", "30"], "properties": ["sneezes, speaks, woman", "background, tick, repeat"], "captions_pred_video": ["footage is blurry and out of focus", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a woman sneezes and speaks", "a clock is ticking and people are talking"], "question": "which entity is a man speaking in the background?", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["w34HjHr6gAY", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "a woman, a television program, a bird"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman is speaking and a dog is whimpering"], "question": "which entity has a 
bird?", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a car speeding up in the distance"], "sample_ids": ["xC8kbrKJmco", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["background, goat, scream", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "small dogs yip and bark sharply"], "sample_ids": ["wRBHTgrbiwg", "v-wcQf4BDY0"], "start_seconds": ["50", "120"], "properties": ["bird, owl, speak", "bark, yip, sharply"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "pigeons vocalize and birds chirp"], "sample_ids": ["xV7Mg1QucSc", "uiS58TNyUiw"], "start_seconds": ["14", "430"], "properties": ["alarm, ticktocks, laughs", "vocalize, bird, chirp"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "of the pigeon in the cage"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a vehicle engine accelerating then running on idle"], "sample_ids": ["y8dSeubCNI", "vYkA3cfXp5Q"], "start_seconds": ["4", "30"], "properties": ["men, women, car", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["an engine revving and people talking in the background", "an engine is 
idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "some tunes played by whistling"], "sample_ids": ["uEU-Hg5MTN8", "u6BnG6YZqJ4"], "start_seconds": ["27", "0"], "properties": ["animal, grunts, snorts", "tune, play, whistling"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a toilet flushes and water drains", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sfAvvZwdLCY", "y8WEcpOlT3I"], "start_seconds": ["20", "40"], "properties": ["water drains, flushes, water", "harsh, wind, blows"], "captions_pred_video": ["footage of the toilet in the bathroom", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with wind noise in the background "], "question": "which entity is a source of water", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "people cheer as a vehicle engine revs"], "sample_ids": ["vddP56-ogds", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["liquid, laughs, man", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["birds tweet and squawk", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["w1mlz3Pe4fU", "zl9Dqx-j7q4"], "start_seconds": ["300", "6"], "properties": ["squawk, tweet, scream", "engine, laugh, loud"], "captions_pred_video": ["of a bird in a 
cage", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and singing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["zofjfKhqLk8", "uWAAAL4CIoc"], "start_seconds": ["10", "0"], "properties": ["background, metal, clank", "a woman, chirps, animal"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a baby laugh at a sputter", "water flows and trickles"], "sample_ids": ["sLUnaPT5gM8", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["laugh, sputter, baby", "water, flow, trickle"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["siJFXfGWgDk", "uZesmtKZGSw"], "start_seconds": ["50", "250"], "properties": ["a, bird, vehicle", "men, talk, cars"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sWZzXuWYY", "uEU-Hg5MTN8"], "start_seconds": ["420", "27"], "properties": ["male, clanks, thumps", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["wTideSjRFS0", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["food, sizzle, woman", "two men, woman, birds"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yeFvk9x0wWI", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["clack, bird, chirp", "rooster, crow, background, men"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of the sun shining through the clouds on a cloudy day"], 
"captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["people speak as gunfire rings out", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wqTCwqVRDlk", "y8WEcpOlT3I"], "start_seconds": ["80", "40"], "properties": ["gunfire, ring, speak", "harsh, wind, blows"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "music plays and animals vocalize as a cartoon character makes sounds"], "sample_ids": ["xl2PIWyXaM", "weDbePuc-Xc"], "start_seconds": ["160", "40"], "properties": ["chirp, man, younger person", "cartoon character, music, vocalize"], "captions_pred_video": [null, "a cartoon frog and a butterfly are sitting on the ground next to each other"], "captions_pred_audio": ["birds are chirping and people are talking", "a man is speaking and birds are chirping with a frog croaking in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sG7TyPnFDR0", "sLUnaPT5gM8"], "start_seconds": ["180", "0"], "properties": ["beeps, machine, smoke alarm", "loud, laughter, intermittent"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is 
more quiet", "label": 0}, {"captions": ["a child babbles as a woman speaks", "a man speaks as a machine runs"], "sample_ids": ["wEBlkGWVWwE", "vD6lYD1l0BY"], "start_seconds": ["260", "330"], "properties": ["a, babble, woman", "a, machine, run"], "captions_pred_video": ["shows a person writing on the whiteboard", "game controller being held in the hands of the person"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking and dishes are being washed "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["zALy31PjDl0", "wnpJndXuxLc"], "start_seconds": ["21", "50"], "properties": ["a man, a vehicle, a horn", "blows, vehicle, train"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity has a train blowing a horn?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a man speaks as a motor runs in the background"], "sample_ids": ["zofjfKhqLk8", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "background, motor, run"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "pigeons vocalize and birds chirp"], 
"sample_ids": ["wvKpEYswXO0", "uiS58TNyUiw"], "start_seconds": ["150", "430"], "properties": ["sound, water, running", "vocalize, bird, chirp"], "captions_pred_video": ["of the person preparing food in the kitchen", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uZesmtKZGSw", "su6FAOcOA8c"], "start_seconds": ["250", "4"], "properties": ["car, track, man", "engine, idle, woman"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a door slams shut and an object moves on a hard surface", "a child speaks in closed space"], "sample_ids": ["zkKdxzNC97Y", "yW6FWLSLkx4"], "start_seconds": ["27", "40"], "properties": ["hard, surface, door", "child, space, speak"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a door is 
opened and closed", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ugHJF0hfYkg", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["engine, running, continuously", "female, spraying, scream"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a dog barks and whimpers"], "sample_ids": ["xzKKf9bKNUo", "sShpyu2l4YQ"], "start_seconds": ["10", "0"], "properties": ["background, noise, snoring", "barks, whimpers, dog"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "the puppies are playing with a toy"], "captions_pred_audio": ["a person snoring loudly", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["rwtmaKiCcQU", "uWPRNLnpy7Y"], "start_seconds": ["30", "10"], "properties": ["nozzle, depressed, spray can", "accelerate, laugh, vehicle"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "is taken from a car driving down the street"], "captions_pred_audio": ["spraying and people speaking", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["some men converse over an engine running", "paper is crumpling consistently"], "sample_ids": ["sCiy7QS1U", "v5cSxLaHADY"], "start_seconds": ["300", "0"], "properties": ["men, converse, 
engine", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["shmR4OZtzqA", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["man, engine, idle", "airplane, boy, fly"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man speaks while a motor runs", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a toilet flushes and a female speaks"], "sample_ids": ["yeFvk9x0wWI", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["clack, bird, chirp", "female, flushes, toilet"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage is blurry and out of focus"], "captions_pred_audio": ["birds chirp in the 
background as a car drives by ", "a toilet flushes and a man speaks"], "question": "which entity is accompanied by a female speaking?", "label": 1}, {"captions": ["a man talks while vehicles pass by", "an airplane engine runs"], "sample_ids": ["sK4u5T8hW78", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["a, man, talk", "engine, airplane, runs"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "pigeons vocalize and birds chirp"], "sample_ids": ["sxYkFKFIZD0", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["screech, man, door", "vocalize, bird, chirp"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["water splashes as an animal walks through", "an infant crying as a 
woman laughs"], "sample_ids": ["w1ir-sZ3Im8", "xhmRY9yhC7c"], "start_seconds": ["90", "20"], "properties": ["animal, water, splashes", "a, laugh, infant"], "captions_pred_video": ["footage of a group of people riding horses through a river", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a vehicle engine revs and tires squeal"], "sample_ids": ["xBxDz0CFVn0", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["stream, water, flow", "engine revs, tires squeal, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a child speaks in closed space"], "sample_ids": ["zofjfKhqLk8", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["background, metal, clank", "child, space, speak"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "an electric engine works nearby followed by a child talking"], "sample_ids": ["wz7N8YRy74I", "xSKJGCItUWE"], "start_seconds": ["30", "10"], "properties": ["rooster, crow, background, people", "engine, work, child"], 
"captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a high pitched engine is running and a child speaks"], "question": "which entity has a child talking?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a man speaks as a car is passing by"], "sample_ids": ["sQwlkXjQabo", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "a, car, pass"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["some men converse over an engine running", "an emergency vehicle engine runs then a horn blows and siren sounds"], "sample_ids": ["sCiy7QS1U", "y2bVZ7rz-5M"], "start_seconds": ["300", "280"], "properties": ["men, converse, engine", "engine, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an infant crying frantically", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zwOBqeFTgiU", "wqZ135Ssz0"], "start_seconds": ["30", "60"], 
"properties": ["cry, infant, frantically", "two men, woman, birds"], "captions_pred_video": ["of the baby crying in the car seat", null], "captions_pred_audio": ["a baby cries loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human", "label": 1}, {"captions": ["goats bleat and people speak", "several insects fly while two men talk"], "sample_ids": ["z5iUE5h0EPs", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["goats bleat, people speak, language", "several, fly, men"], "captions_pred_video": ["of the goat in the barn", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a goat bleats and a man speaks", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["vehicles pass by on a roadway", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tgbONvsP47Y", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["pass, vehicle, roadway", "rustling, ducks, quack"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water drains", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sfAvvZwdLCY", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["water drains, flushes, water", "a woman, something, fried"], "captions_pred_video": ["footage of the toilet in the bathroom", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a person talking?", "label": 1}, {"captions": ["an infant crying 
and a woman speaking with some distant murmuring", "wind blowing followed by a zoom"], "sample_ids": ["smDKStoHBJo", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["a, infant, speaking", "wind, blow, zoom"], "captions_pred_video": ["a man holding a crying baby in his arms", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a person is snoring while sleeping", "a stream of water runs briefly"], "sample_ids": ["vJrjSeP17yE", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["a person is sleeping, snoring, person", "stream, water, run"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a person is snoring while sleeping", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vJrjSeP17yE", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["a person is sleeping, snoring, person", "People, motor, brakes"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person?", "label": 0}, {"captions": ["continuous snoring", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["sLkeqCDJIyw", "zY3icUyMdh8"], "start_seconds": ["120", "20"], "properties": ["loud, snoring, noise", "dog, bark, engine"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a person is snoring loudly", "a car is driving and dogs are barking and squealing "], "question": "which entity is louder", "label": 1}, {"captions": ["a person speaks briefly", "an insect buzzes around continuously"], "sample_ids": ["zOZleIRqZm4", "v25l1jef3JY"], "start_seconds": ["80", "0"], "properties": ["person, talk, brief", "buzzes, continuously, insect"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a fly is buzzing around a microphone "], "question": "which entity is not a person?", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sZPuqDgX2V0", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["commentator, race, track", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "an infant crying as a woman laughs"], "sample_ids": ["w5W5Kqtc8E", "xhmRY9yhC7c"], "start_seconds": ["100", "20"], "properties": ["wind, blow, vehicle", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a motorcycle engine is idling", "dog barking and vehicle engine idling followed shortly by vehicle engine revving"], 
"sample_ids": ["vZAqdHZ81yA", "zY3icUyMdh8"], "start_seconds": ["180", "20"], "properties": ["engine, motorcycle, idling", "dog, bark, engine"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["an engine is idling loudly", "a car is driving and dogs are barking and squealing "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "pigeons vocalize and birds chirp"], "sample_ids": ["zkKdxzNC97Y", "uiS58TNyUiw"], "start_seconds": ["27", "430"], "properties": ["hard, surface, door", "vocalize, bird, chirp"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of the pigeon in the cage"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an airplane engine runs", "pigeons vocalize and birds chirp"], "sample_ids": ["yVPZ2MNWpms", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["engine, airplane, runs", "vocalize, bird, chirp"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of the pigeon in the cage"], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "water flows and trickles"], "sample_ids": ["xKB8O8LTs6s", "tB7hWb9gTuQ"], "start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "water, flow, trickle"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are 
fired ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a man speaks as a car is passing by"], "sample_ids": ["vcmWSmvti8", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["music, man, fire", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man speaking as a car passes by?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["vimzuGQvdcU", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["a, man, yells", "background, birds, rustling"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak as gunfire rings out", "wind blowing followed by a zoom"], "sample_ids": ["wqTCwqVRDlk", "vr8ZXjEBhMQ"], "start_seconds": ["80", "150"], "properties": ["gunfire, ring, speak", "wind, blow, zoom"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and a gun is 
fired", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, {"captions": ["a small engine spits as it runs", "two women and a man talk while a kid cries"], "sample_ids": ["sZvwOuuPGP0", "wyllXV6PjKo"], "start_seconds": ["50", "30"], "properties": ["spits, engine, runs", "a kid, talk, cry"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", null], "captions_pred_audio": ["a medium engine is running ", "a woman speaks and a baby cries"], "question": "which entity is a person", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["wy1eKjR7KC0", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["people, talk, distance", "animal, grunts, chirps"], "captions_pred_video": ["two police officers riding motorcycles down the street", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking and a siren is going off", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a man speaks and is typing on a keyboard"], "sample_ids": ["ukg5L09Wpvo", "x9JovgqUcs"], "start_seconds": ["150", "500"], "properties": ["a train, a horn, a bell", "a, man, speaks, keyboard"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man speaks and types on a keyboard"], "question": "which entity is stationary", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "pigeons vocalize and birds chirp"], "sample_ids": ["w2JXXIAdUdg", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["snoring, distance, 
person", "vocalize, bird, chirp"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of the pigeon in the cage"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a man speaks followed by another man speaking outside"], "sample_ids": ["xjvTpk2Zpr8", "viuTg1M-dqg"], "start_seconds": ["70", "30"], "properties": ["wind, blows, vehicle", "two men, speak, follow"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single person speaking?", "label": 0}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "people applaud and hoot and chat quietly"], "sample_ids": ["y2ZBGpgbhHM", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["birds, tweet, pant", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "water flows and trickles"], "sample_ids": ["w6RTHR6AeAg", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["call, owl, screech", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a man speaks as insects 
buzz and a bird chirps"], "sample_ids": ["vddP56-ogds", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["water, flow, laugh", "a, chirps, bird"], "captions_pred_video": [null, "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and bees are buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "pigeons vocalize and birds chirp"], "sample_ids": ["w34HjHr6gAY", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["beeps, squawk, child speaking", "vocalize, bird, chirp"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of the pigeon in the cage"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a man speaks as a motor runs in the background"], "sample_ids": ["vimzuGQvdcU", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "background, motor, run"], "captions_pred_video": ["a group of people are rafting down a river", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the 
background?", "label": 1}, {"captions": ["people clap and speak in the distance", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wwyfGO2J4", "xKB8O8LTs6s"], "start_seconds": ["90", "70"], "properties": ["clap, distance, speak", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "people applaud and hoot and chat quietly"], "sample_ids": ["tDlysoZiA1I", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["animal, grunt, multiple", "people, applaud, hoot"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["smDKStoHBJo", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["a, infant, speaking", "a train, a horn, a bell"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["zj2R0XoFr5k", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["airplane, fly, woman", "a woman, chirps, animal"], "captions_pred_video": ["footage of a 
small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a dog is barking "], "question": "which entity is a bird?", "label": 1}, {"captions": ["a male speaks over some small clicks", "wind blows as people chatter quietly"], "sample_ids": ["uXxVebHsGZ8", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["male, clicks, speak", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a woman speaks happily and an animal chirps"], "sample_ids": ["wztCSUxOf8", "uWAAAL4CIoc"], "start_seconds": ["130", "0"], "properties": ["a crowd, yells, applauds", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vhJWZheqaE", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["water drains unevenly, toilet flushes, water drains", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a toilet is flushed", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vBslzh7saPw", "sSMl2vc3ek"], "start_seconds": ["90", "20"], "properties": ["engine, spools, takes", "loud, multiple, distance"], "captions_pred_video": ["a pickup truck carrying a large 
object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a child speaks in closed space"], "sample_ids": ["xZepNM9qcRA", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["background, motor, run", "child, space, speak"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a propeller rotates loudly and intensely"], "sample_ids": ["sYITalLZjj4", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["water, rushes, background, birds", "loud, intense, propeller"], "captions_pred_video": ["two ducks are swimming in the water near each other", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["wind blows and birds chirp", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sjlVMgdGSK0", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["accelerates, vehicle, race car", "a, scream, girl"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["music plays and animals vocalize as a 
cartoon character makes sounds", "an airplane engine runs"], "sample_ids": ["weDbePuc-Xc", "yVPZ2MNWpms"], "start_seconds": ["40", "0"], "properties": ["cartoon character, music, vocalize", "engine, airplane, runs"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a horn honks and then loudly blares", "an infant crying frantically"], "sample_ids": ["wnpJndXuxLc", "zwOBqeFTgiU"], "start_seconds": ["50", "30"], "properties": ["horn, honk, loud", "cry, infant, frantically"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of the baby crying in the car seat"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a baby cries loudly"], "question": "which entity is louder", "label": 0}, {"captions": ["wind noise makes sound into a microphone", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["w8uLijTqtlU", "yajyRTUQk3U"], "start_seconds": ["70", "400"], "properties": ["wind, microphone, noise", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and shaky", "- a woman cooking in the kitchen"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zl9Dqx-j7q4", "vfYTJq7nU"], "start_seconds": ["6", "130"], "properties": ["motors rev, laugh, loudly", "rustling, ducks, quack"], 
"captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "water flows as men speak and yell"], "sample_ids": ["slZLHwNbbt4", "vJ7JPEFhyLA"], "start_seconds": ["300", "16"], "properties": ["train, horn, sound", "water, flow, men"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a moving object", "label": 0}, {"captions": ["a clock ticks quietly and rhythmically", "some men converse over an engine running"], "sample_ids": ["u7C-AEBQM", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["ticks, rhythmic, quiet", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more active", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["uPDn2BFTHk", "uYT5gxnyMWM"], "start_seconds": ["140", "50"], "properties": ["lady, laugh, baby", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a baby?", "label": 0}, {"captions": ["a person is snoring while sleeping", "small dogs yip and bark sharply"], "sample_ids": ["vJrjSeP17yE", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["a person is sleeping, snoring, person", "bark, yip, 
sharply"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a person snoring loudly", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "several insects fly while two men talk"], "sample_ids": ["su6FAOcOA8c", "s-T9OVOiMLo"], "start_seconds": ["4", "330"], "properties": ["engine, run, woman", "several, fly, men"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has a woman making an announcement?", "label": 0}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sxYkFKFIZD0", "wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["screech, man, door", "rooster, crow, background, men"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a stream of water flows quickly", "children speak and play together"], 
"sample_ids": ["wbHTKEJZyhc", "yVVP8XvWJTo"], "start_seconds": ["20", "260"], "properties": ["stream, water, flow", "children, speak, play"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "children are speaking and breathing with background noise "], "question": "which entity is moving faster", "label": 0}, {"captions": ["an adult woman and an adult man speak", "a car accelerates and wind blows"], "sample_ids": ["zTLVJCo4WEE", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["two people, adult, speak", "accelerates, wind, blows"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a race car accelerates and revs its engine "], "question": "which is not a 
person", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yRx9txMcBl0", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["accelerates, tires, squeals", "clickety-clack, train, whistle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "some tunes played by whistling"], "sample_ids": ["sHbXC6na9hg", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["a person, saw, wood", "tune, play, whistling"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["an engine is idling and vibrating", "a person whistling a song"], "question": "which entity is not a person?", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xyL9F5VrjkE", "yDoT73BWsdA"], "start_seconds": ["20", "10"], "properties": ["wind, blows, vehicle", "engine, revs, vehicle"], "captions_pred_video": ["of a caterpillar truck 
loading logs into a trailer", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving", "label": 1}, {"captions": ["food is frying then a woman speaks", "a machine beeps continuously"], "sample_ids": ["ukxt9I7eMMg", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["food, woman, speak", "beeps, machine, continuously"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a door opens and closes", "a clock ticktocks"], "sample_ids": ["vBHyYJ8pL0", "v-g-j2uTByM"], "start_seconds": ["2", "30"], "properties": ["open, close, door", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["dogs barking and whimpering", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tIY7qOV3rEM", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "rooster, crow, background, men"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": 
["xV7Mg1QucSc", "xKB8O8LTs6s"], "start_seconds": ["14", "70"], "properties": ["alarm, ticktocks, laughs", "music, gunfire, explosion"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a child yells and another yells", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vMDHu7Lxcgw", "wDVMhEdTiVw"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "gun, shoot, water"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["yRx9txMcBl0", "wDVMhEdTiVw"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "gun, shoot, water"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a gun is 
fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["an engine runs loudly", "a man speaks and is typing on a keyboard"], "sample_ids": ["vqZuVbG6-HI", "x9JovgqUcs"], "start_seconds": ["130", "500"], "properties": ["loud, engine, run", "a, man, speaks, keyboard"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man speaks and types on a keyboard"], "question": "which entity is quieter", "label": 1}, {"captions": ["a person is snoring while sleeping", "people speak and tapping occurs"], "sample_ids": ["vJrjSeP17yE", "tFCUUGdREgA"], "start_seconds": ["40", "70"], "properties": ["a person is sleeping, snoring, person", "people, tap, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a person riding a white horse in an indoor arena"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and walking with wind noise in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["a child babbles as a woman speaks", "a clock ticktocks"], "sample_ids": ["wEBlkGWVWwE", "v-g-j2uTByM"], "start_seconds": ["260", "30"], "properties": ["a, babble, woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows a person writing on the whiteboard", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["zsLxS-uLJTw", "vlJS7LN2XyM"], "start_seconds": ["20", "30"], "properties": ["horn, blast, train", "background, clocks, ticking"], "captions_pred_video": ["footage of the train on the tracks at 
sunrise or sunset", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a ticktock of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a woman speaks happily and an animal chirps"], "sample_ids": ["vVhthZ45k3Y", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["cat, purr, hiss", "a woman, chirps, animal"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be a bird", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "water is sprayed across a hard surface"], "sample_ids": ["smGI3C1NZc", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["water, drain, toilet", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a toilet is flushed", "spraying followed by silence"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yDoT73BWsdA", "uZesmtKZGSw"], "start_seconds": ["10", "250"], "properties": ["engine revs, tires squeal, vehicle", "men, talk, cars"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["a man speaks while water drains", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vSeGhaZt-aI", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["water, drain, man", "a, scream, girl"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a person sneezes followed by another person speaking"], "sample_ids": ["tOSWIURC-4", "t8CV69hcvF0"], "start_seconds": ["0", "210"], "properties": ["engine, work, nearby", "person, sneeze, follow"], "captions_pred_video": [null, "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a lawn mower is running ", "a woman sneezes and speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "some men converse over an engine running"], "sample_ids": ["uWAAAL4CIoc", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["a, dog, vocalize", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows 
louder", "a door opens and closes"], "sample_ids": ["slZLHwNbbt4", "vBHyYJ8pL0"], "start_seconds": ["300", "2"], "properties": ["clap, distance, horn", "open, close, door"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is more likely to be a door", "label": 1}, {"captions": ["speaking following by laughing and clapping", "an airplane engine runs"], "sample_ids": ["u2f5NpsoHBg", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["person, laugh, clap", "engine, airplane, runs"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a car is driving by on the road "], "question": "which entity is not a person?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "an engine runs loudly"], "sample_ids": ["sQwlkXjQabo", "vqZuVbG6-HI"], "start_seconds": ["10", "130"], "properties": ["liquid, surface, spray", "loud, engine, run"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage is blurry because it's raining outside"], "captions_pred_audio": ["spraying followed by silence", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["people speak in a closed space", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sTpirNYo8vQ", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["people, space, speak", "a woman, something, fried"], "captions_pred_video": ["of a man taking a selfie on a bus", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", 
"a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vh30P49Po6s", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["loud, continuous, quacks", "a woman, something, fried"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "wind blowing followed by a zoom"], "sample_ids": ["sShpyu2l4YQ", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["growl, bark, yip", "wind, blow, zoom"], "captions_pred_video": ["the puppies are playing with a toy", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a dog is barking and growling", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["v-wcQf4BDY0", "xfaoyyzw2WU"], "start_seconds": ["120", "180"], "properties": ["bark, yip, sharply", "loud, jet engine, roar"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a dog barks and growls", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["v0x1odnXtP0", "vlS6YMeWAPo"], "start_seconds": ["210", "40"], "properties": ["keyboard, type, computer", "sheep, baa, birds"], 
"captions_pred_video": ["how to make money on youtube in spanish", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a person is typing on a keyboard", "a goat bleats and birds chirp"], "question": "which entity is not a person?", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a infant makes noise and is excited"], "sample_ids": ["xjvTpk2Zpr8", "wIJK3-5y0kA"], "start_seconds": ["70", "30"], "properties": ["engine, run, wind", "noise, excited, infant"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "paper is crumpling consistently"], "sample_ids": ["skd2PphS6oI", "v5cSxLaHADY"], "start_seconds": ["190", "0"], "properties": ["ring, bird, vocalize", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaking with light rustling", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["zOZleIRqZm4", "yDoT73BWsdA"], "start_seconds": ["80", "10"], "properties": ["light, rustling, man", "engine, revs, vehicle"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a door slams shut and an object 
moves on a hard surface", "an infant crying frantically"], "sample_ids": ["zkKdxzNC97Y", "zwOBqeFTgiU"], "start_seconds": ["27", "30"], "properties": ["hard, surface, door", "cry, infant, frantically"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of the baby crying in the car seat"], "captions_pred_audio": ["a door is opened and closed", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "ducks quack as a man speaks and makes a duck sound"], "sample_ids": ["vVhthZ45k3Y", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["cat, purr, hiss", "ducks, quack, man"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a duck quacks and a woman speaks"], "question": "which entity is a man speaking to animals?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "pigeons vocalize and birds chirp"], "sample_ids": ["zY3icUyMdh8", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["dog, bark, engine", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of the pigeon in the cage"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "paper folding and crinkling"], "sample_ids": ["wztCSUxOf8", "zPpG3RD8lSs"], "start_seconds": ["130", "20"], "properties": ["a crowd, yells, applauds", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "the wind blows and a mouse clicks "], "question": "which is not a crowd", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a car accelerates and wind blows"], "sample_ids": ["xjvTpk2Zpr8", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["wind, blows, vehicle", "accelerates, wind, blows"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is more likely to be a car", "label": 1}, {"captions": ["a clock ticktocks briefly", "a car speeding up in the distance"], "sample_ids": ["u7C-AEBQM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, ticktocks briefly", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a clock ticktocks"], "sample_ids": ["wPz6QRAkEb4", "v-g-j2uTByM"], "start_seconds": ["60", "30"], "properties": ["chirps, tweets, song", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a bird in a cage on top of a pole", "in your own words a cuckoo clock hanging on 
the wall"], "captions_pred_audio": ["birds are chirping in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a man speaks as crickets sing"], "sample_ids": ["w5W5Kqtc8E", "ryFDPxgDOGc"], "start_seconds": ["100", "570"], "properties": ["wind, engine, scream", "a, crickets, sing"], "captions_pred_video": [null, "a group of people dressed in camouflage and hunting gear in the dark"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with crickets chirping in the background"], "question": "which entity is quieter", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a horn rings out as a machine runs by"], "sample_ids": ["wSVhSdj0F0", "slZLHwNbbt4"], "start_seconds": ["10", "300"], "properties": ["horn honks, keys jingle, slam", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["an engine runs loudly", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vqZuVbG6-HI", "wDVMhEdTiVw"], "start_seconds": ["130", "30"], "properties": ["loud, engine, run", "gun, shoot, water"], "captions_pred_video": ["footage is blurry because it's raining outside", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "three men talk while wind blows and some liquid 
flows"], "sample_ids": ["yajyRTUQk3U", "vJ7JPEFhyLA"], "start_seconds": ["400", "16"], "properties": ["a woman, something, fried", "three men, wind, flow"], "captions_pred_video": ["- a woman cooking in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about cooking?", "label": 0}, {"captions": ["a clock ticks quietly and rhythmically", "a man speaks over intermittent keyboard taps"], "sample_ids": ["u7C-AEBQM", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["ticks, rhythmic, quiet", "audio, man, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a ticktock of a clock", "a man speaks and types on a computer keyboard "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a cat meows as a young woman speaks", "people speak as gunfire rings out"], "sample_ids": ["x5cuQjOdM3E", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["cat, meows, young woman", "gunfire, ring, speak"], "captions_pred_video": 
["a black background with an airplane flying in the sky", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["plastic is tapped on while someone speaks", "music plays and animals vocalize as a cartoon character makes sounds"], "sample_ids": ["wvKpEYswXO0", "weDbePuc-Xc"], "start_seconds": ["150", "40"], "properties": ["plastic, tap, speak", "cartoon character, music, vocalize"], "captions_pred_video": ["of the person preparing food in the kitchen", "a cartoon frog and a butterfly are sitting on the ground next to each other"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and birds are chirping with a frog croaking in the background "], "question": "which entity is a cartoon character?", "label": 1}, {"captions": ["women speak and laugh as wind blows", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["un9VQlzgZM", "zFjIWfSD-4"], "start_seconds": ["5", "410"], "properties": ["wind, speak, laugh", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "some tunes played by whistling"], "sample_ids": ["sOa7g-44Dag", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["background, man, spray", "tune, play, whistling"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and rubbing his hands together 
", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["a person snoring several times", "several insects fly while two men talk"], "sample_ids": ["spJCm8tD9Zo", "s-T9OVOiMLo"], "start_seconds": ["90", "330"], "properties": ["snore, person, several", "several, fly, men"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a person?", "label": 0}, {"captions": ["a clock alarm sounds and gears turn", "water splashes as an animal walks through"], "sample_ids": ["w2M4i1mklOA", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["alarm, gears, turn", "animal, water, splashes"], "captions_pred_video": ["footage of an antique clock", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a child speaks in closed space"], "sample_ids": ["su6FAOcOA8c", "yW6FWLSLkx4"], "start_seconds": ["4", "40"], "properties": ["engine, run, woman", "child, space, speak"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["someone is snoring while sleeping", "a helicopter engine runs continuously"], "sample_ids": ["ujMt0-D-x2k", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["snore, sleep, someone", 
"engine, running, continuously"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person is snoring loudly", "a helicopter is flying overhead "], "question": "which entity is not running continuously?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "small dogs yip and bark sharply"], "sample_ids": ["ugHJF0hfYkg", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["engine, running, continuously", "bark, yip, sharply"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a helicopter is flying overhead ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vbpKkWvfOu4", "vb1fPSDI4c"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "multiple, people, yell"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "people applaud and hoot and chat quietly"], "sample_ids": ["zsLxS-uLJTw", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["horn, blast, train", "people, applaud, hoot"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", null], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "people are clapping 
and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["zj2R0XoFr5k", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], "properties": ["airplane, fly, woman", "two objects, woman, speak"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is being rubbed together", "label": 0}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a clock ticktocks"], "sample_ids": ["s7knHCFW82w", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "in your own words a cuckoo clock hanging on the wall"], 
"captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["continuous sneezing together with speech", "cats meow and then a person begins to talk while the cats continue to meow"], "sample_ids": ["x4dZyf9Gbj0", "x5cuQjOdM3E"], "start_seconds": ["130", "30"], "properties": ["continuous, sneeze, speech", "cat, talk, meow"], "captions_pred_video": ["footage is blurry and out of focus", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a woman sneezes and speaks", "a cat meows and a woman speaks"], "question": "which entity is a cat?", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "men speak and a nozzle sprays liquid"], "sample_ids": ["xM4joTqDVp4", "wRV8yMk886E"], "start_seconds": ["160", "0"], "properties": ["background, chirp, birds", "liquid, spray, nozzle"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man speaks followed by a loud burst"], "question": "which entity is more likely to be used in a science class", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zhx6hoYrHeI", "w5W5Kqtc8E"], "start_seconds": ["160", "100"], "properties": ["engine, sputter, rough", "wind, blow, vehicle"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "wind loudly blowing while people speak in the background 
followed by a horn blowing"], "sample_ids": ["sapQIQUhFc", "xjhAnI2q6hM"], "start_seconds": ["280", "6"], "properties": ["water, stream, trickles", "wind, blow, loudly"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["someone is burping continuously", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y636gklDioE", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["burps, burps, burps", "music, gunfire, explosion"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person burps loudly several times", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["zj2R0XoFr5k", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, overhead", "two men, speak, follow"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a male speaks and another male speaks", "water splashes and wind noise is made into a microphone"], "sample_ids": ["viuTg1M-dqg", "sDSppXIlJrs"], "start_seconds": ["30", "27"], "properties": ["two males, speaking, male", "microphone, water, wind"], 
"captions_pred_video": ["footage of water coming out of a hole in the ground", "a man is paddling a small wooden boat in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "the wind is blowing and water is splashing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a clock ticktocks briefly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["u7C-AEBQM", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks briefly", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a clock?", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "small dogs yip and bark sharply"], "sample_ids": ["vs65y4qmyBE", "v-wcQf4BDY0"], "start_seconds": ["340", "120"], "properties": ["wind, blows, strongly", "bark, yip, sharply"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["someone snores nearby", "birds chirp and objects are moved around"], "sample_ids": ["spJCm8tD9Zo", "yPUYU6t3rwo"], "start_seconds": ["90", "370"], "properties": ["someone snores, nearby, someone", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a person is snoring loudly", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman 
talks while a baby cries and a man whispers", "a machine beeps continuously"], "sample_ids": ["smDKStoHBJo", "y682ml90jGw"], "start_seconds": ["0", "11"], "properties": ["a, talk, baby, cry", "beeps, machine, continuously"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["some clanking with distant murmuring", "birds chirp and objects are moved around"], "sample_ids": ["uMTTDZ2mb4", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["clanking, murmuring, distant", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a man speaks as water trickles down a stream"], "sample_ids": ["y2bVZ7rz-5M", "sapQIQUhFc"], "start_seconds": ["280", "280"], "properties": ["motor noise, horn, siren", "water, stream, trickles"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["v7jJS8aAyA", "tiDFTC-5vU"], "start_seconds": ["10", "30"], "properties": ["wind, blows, loudly", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a man is speaking and ducks are 
quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "someone whistles a tune"], "sample_ids": ["uPDn2BFTHk", "sIXTftIuUgw"], "start_seconds": ["140", "90"], "properties": ["woman, laughs, speaks", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a person whistling a song"], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying while a woman speaks", "water is sprayed across a hard surface"], "sample_ids": ["yhQ2Lg-7qDY", "sQwlkXjQabo"], "start_seconds": ["130", "10"], "properties": ["food, woman, speak", "water, spray, surface"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a faucet is running and a man is speaking", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "small dogs yip and bark sharply"], "sample_ids": ["zF8yoL0rkbI", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["engine, run, someone", "bark, yip, sharply"], "captions_pred_video": ["footage of the traffic on the street at night", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["children cry and people talk", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xLwHe825Zs", "zj2R0XoFr5k"], "start_seconds": ["18", "50"], "properties": ["people talk, children cry, people talk", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a 
baby cries and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a boy speaking?", "label": 1}, {"captions": ["a clock ticktocks", "vehicles pass by on a roadway"], "sample_ids": ["v-g-j2uTByM", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, ticktocks", "pass, vehicle, roadway"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a clock is ticking loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vW4x7S1VfQc", "tdWhHV3X25Q"], "start_seconds": ["150", "60"], "properties": ["clacking, oil, woman", "applause, audience, yells"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["s7knHCFW82w", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "male, duck, laugh"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", null], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "wind blows as people chatter quietly"], "sample_ids": 
["y8dSeubCNI", "xBxDz0CFVn0"], "start_seconds": ["4", "30"], "properties": ["engine revving, people speaking, motorcycle", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["x6ijhqRY38s", "ukg5L09Wpvo"], "start_seconds": ["250", "150"], "properties": ["something metal, glass, hit", "clickety-clack, train, whistle"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "three men talk while wind blows and some liquid flows"], "sample_ids": ["s6DESzUTGjY", "vJ7JPEFhyLA"], "start_seconds": ["16", "16"], "properties": ["wind, laugh, woman", "three men, wind, flow"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how 
to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a woman laughing", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["yZmhM1HcsyE", "xfaoyyzw2WU"], "start_seconds": ["4", "180"], "properties": ["engine, roar, water", "loud, jet engine, roar"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a duck quacks several times", "birds chirp and objects are moved around"], "sample_ids": ["vh30P49Po6s", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["quacks, duck, several", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a duck is quacking loudly", "insects buzz and a man speaks"], "question": "which entity is a bird", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ylpYOorfH4o", "uYT5gxnyMWM"], "start_seconds": ["410", "50"], "properties": ["motor, run, steady", "female, spraying, scream"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 
dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["w2bYrCVLT60", "xjvTpk2Zpr8"], "start_seconds": ["120", "70"], "properties": ["ducks, speak, quack", "wind, blows, vehicle"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a jet engine roars and wind blows "], "question": "which entity is more active", "label": 1}, {"captions": ["leaves rustle while man speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zOZleIRqZm4", "vfYTJq7nU"], "start_seconds": ["80", "130"], "properties": ["leaves, rustle, speak", "rustling, ducks, quack"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a duck quacks and a woman speaks"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "paper is 
crumpling consistently"], "sample_ids": ["yPUYU6t3rwo", "v5cSxLaHADY"], "start_seconds": ["370", "0"], "properties": ["birds chirp, objects are moved around, birds", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["insects buzz and a man speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zTLVJCo4WEE", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["a, crickets, sing", "a woman, laughs, animal"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal snorting?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vimzuGQvdcU", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["a, man, yells", "clickety-clack, train, whistle"], "captions_pred_video": ["a group of people are rafting down a river", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a woman speaks as she rubs two objects together"], "sample_ids": ["v0x1odnXtP0", "vzxHnu-SFEw"], "start_seconds": ["210", "80"], "properties": ["keyboard, type, computer", "two objects, woman, speak"], 
"captions_pred_video": ["how to make money on youtube in spanish", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is being rubbed together", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["uRExseg-0XI", "sK4u5T8hW78"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "a, car, pass"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is 
speaking while water is running and birds are chirping ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking as a car passes by?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "water pouring and bubbling"], "sample_ids": ["zk-xJGQU8-4", "uyRfq-jKPpo"], "start_seconds": ["130", "50"], "properties": ["food, man, woman", "water, bubbles, pouring"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uYT5gxnyMWM", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["person, spray, yell", "loud, multiple, distance"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person snores 
hilariously while someone laughs", "wind loudly blowing while people speak in the background followed by a horn blowing"], "sample_ids": ["sSMl2vc3ek", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["a person, laughs, snores", "wind, blow, loudly"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person snoring loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["zgUgkpk78xU", "yDoT73BWsdA"], "start_seconds": ["70", "10"], "properties": ["clinking, humming, horn", "engine, revs, vehicle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sZPuqDgX2V0", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["engine, accelerate, intercom", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a woman speaks and laughs 
and an animal grunts and snorts", "a stream of water runs briefly"], "sample_ids": ["uEU-Hg5MTN8", "x-PeY8Yb8M4"], "start_seconds": ["27", "300"], "properties": ["animal, grunts, snorts", "stream, water, run"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vJrjSeP17yE", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["a person is sleeping, snoring, person", "clickety-clack, train, whistle"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a person snoring loudly", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vdoxuJn9lTc", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "stream, water, flow"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage is blurry and out of focus"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a train horn blows as it passes by"], "sample_ids": ["zTLVJCo4WEE", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "horn, blows, train"], "captions_pred_video": ["- a boy with a rifle aiming at a 
target", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a few ducks quack and scamper and a man speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["w2bYrCVLT60", "ugHJF0hfYkg"], "start_seconds": ["120", "10"], "properties": ["ducks, speak, quack", "loud, intense, propeller"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "paper folding and crinkling"], "sample_ids": ["smDKStoHBJo", "zPpG3RD8lSs"], "start_seconds": ["0", "20"], "properties": ["a, infant, speaking", "paper, fold, crinkle"], "captions_pred_video": ["a man holding a crying baby in his arms", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how 
to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "a infant makes noise and is excited"], "sample_ids": ["w-4gHptFNuU", "wIJK3-5y0kA"], "start_seconds": ["21", "30"], "properties": ["engine revs, accelerates, bump", "noise, excited, infant"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sSMl2vc3ek", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["a person, laughs, snores", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person snoring loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["bees buzz and wind blows", "a vehicle engine accelerating then 
running on idle"], "sample_ids": ["tMJne1a4AFI", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["bees buzz, wind blows, bees", "engine, accelerate, idle"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a swarm of bees buzzing around", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a large crowd cheers and applauds", "wind blows as people chatter quietly"], "sample_ids": ["rqfQRErjfk8", "xBxDz0CFVn0"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "wind, chatter, people"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage is blurry and out of focus"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "someone is typing on a computer keyboard"], "sample_ids": ["t97k0cejSQE", "v0x1odnXtP0"], "start_seconds": ["250", "210"], "properties": ["bird, chirp, insect", "keyboard, type, computer"], "captions_pred_video": ["a bee on a purple thistle flower", "how to make money on youtube in spanish"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["water flows as men speak and yell", "an infant crying as a woman laughs"], "sample_ids": ["vJ7JPEFhyLA", "xhmRY9yhC7c"], "start_seconds": ["16", "20"], "properties": ["water, flow, men", "a, laugh, infant"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, 
{"captions": ["frogs croak and vocalize", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["yswmmRZFItk", "vlJS7LN2XyM"], "start_seconds": ["0", "30"], "properties": ["croak, vocalize, frog", "background, clocks, ticking"], "captions_pred_video": ["a close up of a frog in the water", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a frog is croaking", "a ticktock of a clock"], "question": "which entity is silent", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xvDdE3zNf8Y", "y8WEcpOlT3I"], "start_seconds": ["120", "40"], "properties": ["a, female, speaks", "harsh, wind, blows"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water splashes and a motorboat passes as people yell", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w5W5Kqtc8E", "wz7N8YRy74I"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zk-xJGQU8-4", "tiDFTC-5vU"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "male, duck, 
laugh"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has more people", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "some clanking with distant murmuring"], "sample_ids": ["vZAw4apG0Es", "uMTTDZ2mb4"], "start_seconds": ["30", "30"], "properties": ["people, clock, converse", "clanking, murmuring, distant"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "people are talking and a car is driving by with wind noise in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["birds fly and flutter around", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["wGKgwOP3h30", "vzceMbklWc"], "start_seconds": ["30", "180"], "properties": ["fly, flutter, around", "water, faucet, sink"], "captions_pred_video": ["of the pigeons in the coop", null], "captions_pred_audio": ["pigeons coo and flap their wings", "water is running and a man is speaking"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a person screams glaringly", "a frog croaks as other frogs croak in the background"], "sample_ids": ["xC8kbrKJmco", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["glaringly, screams, person", "background, frog, croak"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["a goat is bleating ", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "an engine runs loudly"], "sample_ids": ["wRBHTgrbiwg", "vqZuVbG6-HI"], "start_seconds": ["50", "130"], 
"properties": ["birds, chirp, cooing", "loud, engine, run"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage is blurry because it's raining outside"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a lawn mower is running and men are speaking "], "question": "which is quieter", "label": 1}, {"captions": ["an engine starts and increases in power", "an infant crying as a woman laughs"], "sample_ids": ["zjTG0gaGCUI", "xhmRY9yhC7c"], "start_seconds": ["80", "20"], "properties": ["power, increase, engine", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["an infant crying as a woman laughs", "people speak softly as food sizzles"], "sample_ids": ["xhmRY9yhC7c", "yhQ2Lg-7qDY"], "start_seconds": ["20", "130"], "properties": ["a, laugh, infant", "food, sizzle, speak"], "captions_pred_video": ["of a baby crying in a baby bouncer", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a baby cries and a woman speaks", "a faucet is running and a man is speaking"], "question": "which entity is about a woman laughing?", "label": 0}, {"captions": ["a door opens and birds chirp", "water splashes as an animal walks through"], "sample_ids": ["yeFvk9x0wWI", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["door, open, birds", "animal, water, splashes"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "paper folding and 
crinkling"], "sample_ids": ["uEU-Hg5MTN8", "zPpG3RD8lSs"], "start_seconds": ["27", "20"], "properties": ["animal, grunts, snorts", "paper, fold, crinkle"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "the wind blows and a mouse clicks "], "question": "which entity is more likely to be a toy", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wqN6IIHw3po", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["rain, surface, fall", "loud, jet engine, roar"], "captions_pred_video": ["in your own words what is happening in this screenshot? 
blood splattered all over the place", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and water is splashing", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vlS6YMeWAPo", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["noise, bleat, call", "multiple, people, yell"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a child speaks in closed space"], "sample_ids": ["wnpJndXuxLc", "yW6FWLSLkx4"], "start_seconds": ["50", "40"], "properties": ["blows, vehicle, train", "child, space, speak"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a horse runs while two women talk"], "sample_ids": ["zkKdxzNC97Y", "sdvI1mHAsc"], "start_seconds": ["27", "20"], "properties": ["hard, surface, door", "two women, horse, run"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "horses clip-clop and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a vehicle engine 
runs and wind blows before women yell"], "sample_ids": ["vKrYfzleLB8", "w5W5Kqtc8E"], "start_seconds": ["110", "100"], "properties": ["a, ring, gunshots", "wind, blow, vehicle"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", null], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a horn rings out as a machine runs by"], "sample_ids": ["xjhAnI2q6hM", "slZLHwNbbt4"], "start_seconds": ["6", "300"], "properties": ["wind, blow, loudly", "a, horn, run"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a man speaks as a car is passing by"], "sample_ids": ["vbZ-0lGPneg", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a woman, a television program, a bird", "a, car, pass"], "captions_pred_video": ["of a man holding a baby duck in his hands", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a dog is 
whimpering", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a bird in it?", "label": 0}, {"captions": ["electronic beeps occur in a short series", "a woman speaks and other women and a man talk with her"], "sample_ids": ["y682ml90jGw", "vbpKkWvfOu4"], "start_seconds": ["11", "560"], "properties": ["beeps, series, electronic", "a, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and a man is speaking"], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vYkA3cfXp5Q", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["speed, idle, accelerate", "a woman, laughs, animal"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a child speaks in closed space", "birds chirp and objects are moved around"], "sample_ids": ["yW6FWLSLkx4", "yPUYU6t3rwo"], "start_seconds": ["40", "370"], "properties": ["child, space, speak", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "insects buzz and a man speaks"], "question": "which entity is more 
active", "label": 1}, {"captions": ["an aircraft engine runs", "a woman speaks as she rubs two objects together"], "sample_ids": ["yLCORCnd35Q", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["engine, aircraft, runs", "two objects, woman, speak"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is moving", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "a toilet flushes and a female speaks"], "sample_ids": ["xKB8O8LTs6s", "yaln9y8I7ms"], "start_seconds": ["70", "230"], "properties": ["music, gunshots, explosion", "female, flushes, toilet"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage is blurry and out of focus"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a toilet flushes and a man speaks"], "question": "which entity is more likely to be in a 
bathroom", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "an insect buzzes around continuously"], "sample_ids": ["u--KhUW8l1Y", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["engine, sound, horn", "buzzes, continuously, insect"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a fly is buzzing around a microphone "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zofjfKhqLk8", "wDVMhEdTiVw"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "gun, shoot, water"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a helicopter engine runs continuously", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["ugHJF0hfYkg", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["engine, running, continuously", "People, motor, brakes"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running continuously", "label": 0}, {"captions": ["a door slams shut and an object moves on a hard surface", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zkKdxzNC97Y", "wz7N8YRy74I"], "start_seconds": ["27", "30"], "properties": ["hard, 
surface, door", "rooster, crow, background, men"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a child speaks", "a stream of water runs briefly"], "sample_ids": ["yW6FWLSLkx4", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["a, child, speaks", "stream, water, run"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["water running down a sink while a man is talking", "wind blows as people chatter quietly"], "sample_ids": ["vSeGhaZt-aI", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "wind, chatter, people"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wtDqrBygTcU", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["man, engine, run", "applause, audience, yells"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a motor is running", 
"a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["sawing of wood and rustling with leaves blowing in the distance", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uiItxDsDMFI", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["sound, distance, leaves", "a woman, a television program, a bird"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a saw is being used with background noise ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing far away?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a vehicle engine runs while a siren and horn sound"], "sample_ids": ["xvDdE3zNf8Y", "u--KhUW8l1Y"], "start_seconds": ["120", "0"], "properties": ["a, female, speaks", "engine, sound, horn"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a firefighter spraying water from a fire hydrant at night"], "captions_pred_audio": ["a woman speaks and crumples paper", "a fire truck siren blares and a horn blows "], "question": "which entity is louder", "label": 1}, {"captions": ["water runs into a sink while men speak", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vzceMbklWc", "wDVMhEdTiVw"], "start_seconds": ["180", "30"], "properties": ["water, sink, run", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["water is running and a man is speaking", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is about water running into a sink?", "label": 0}, {"captions": ["a motor idles, accelerates, then slows down.", "several insects fly while two men talk"], "sample_ids": ["vYkA3cfXp5Q", "s-T9OVOiMLo"], 
"start_seconds": ["30", "330"], "properties": ["speed, idle, accelerate", "several, fly, men"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["an engine is idling", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a living thing", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a train horn blows as it passes by"], "sample_ids": ["xSKJGCItUWE", "zVacuqSb4LI"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", "horn, blows, train"], "captions_pred_video": ["footage of the helicopter flying in the room", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a woman speaks happily and an animal chirps"], "sample_ids": ["sZPuqDgX2V0", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["engine, accelerate, intercom", "a woman, chirps, animal"], "captions_pred_video": [null, 
null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a stream of water runs briefly"], "sample_ids": ["rqfQRErjfk8", "x-PeY8Yb8M4"], "start_seconds": ["170", "300"], "properties": ["crowd, cheers, applauds", "stream, water, run"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tPJvjq9QePY", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["animal, bleat, moo", "water, radio, man"], "captions_pred_video": ["a dog and a sheep in a barn", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["v7jJS8aAyA", "wz7N8YRy74I"], "start_seconds": ["10", "30"], "properties": ["wind, blows, loudly", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["a cat meows and children speak", "an audience gives applause as a man yells and a group sings"], "sample_ids": 
["x5cuQjOdM3E", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["cat, speak, children", "applause, audience, yells"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "a vehicle engine accelerating then running on idle"], "sample_ids": ["y8dSeubCNI", "vYkA3cfXp5Q"], "start_seconds": ["4", "30"], "properties": ["engine revving, people speaking, motorcycle", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["an engine revving and people talking in the background", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "paper is crumpling consistently"], "sample_ids": ["zgUgkpk78xU", "v5cSxLaHADY"], "start_seconds": ["70", "0"], "properties": ["clinking, humming, horn", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "winds blows roughly as a vehicle races past"], "sample_ids": 
["tMJne1a4AFI", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["wind, buzz, rustling", "wind, blows, vehicle"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a swarm of bees buzzing around", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "water splashes and a door squeaks"], "sample_ids": ["vJvryTwuAV8", "sdXV-ylviw"], "start_seconds": ["16", "190"], "properties": ["audience, cheer, man", "sound, splash, door"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", null], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a dog barks and taps with background noise "], "question": "which entity has a door that squeaks?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wy1eKjR7KC0", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["people, talk, distance", "engine, idle, woman"], "captions_pred_video": ["two police officers riding motorcycles down the street", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["an airplane engine spools and people speak", "people applaud and hoot and chat quietly"], "sample_ids": ["wTjoRj1se3U", "wwyfGO2J4"], "start_seconds": ["390", "90"], "properties": ["airplane, engine, spool", "people, applaud, hoot"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running 
and people are talking", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["continuous sneezing together with speech", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["x4dZyf9Gbj0", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["continuous, sneeze, speech", "animal, grunts, snorts"], "captions_pred_video": ["footage is blurry and out of focus", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sQwlkXjQabo", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["water, spray, surface", "airplane, boy, fly"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["spraying followed by silence", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["bees buzz as wind blows", "water is sprayed across a hard surface"], "sample_ids": ["tMJne1a4AFI", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["bees, buzz, wind", "water, spray, surface"], "captions_pred_video": ["a swarm of bees on the ground", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a swarm of bees buzzing around", "spraying followed by silence"], "question": "which entity is not a living thing", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wyllXV6PjKo", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], 
"properties": ["a kid, talk, cry", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman speaks and a baby cries", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sU53zg9Jp7s", "uYT5gxnyMWM"], "start_seconds": ["380", "50"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a, scream, girl"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a doorbell?", "label": 0}, {"captions": ["a jet engine screams, then increases its power", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vBslzh7saPw", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["power, scream, increase", "stream, water, flow"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["wind blows as people chatter quietly", "several insects fly while two men talk"], "sample_ids": ["xBxDz0CFVn0", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["wind, chatter, people", "several, fly, men"], "captions_pred_video": ["footage is blurry and out of focus", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking 
with wind noise in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["some people speak", "a car accelerates and wind blows"], "sample_ids": ["vbZ-0lGPneg", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "accelerates, wind, blows"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "pigeons vocalize and birds chirp"], "sample_ids": ["u6jIvCtKarQ", "uiS58TNyUiw"], "start_seconds": ["70", "430"], "properties": ["a, man, speaks", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a person using a blender on a stove top", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uqFtmnhuqA8", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["a, b, c", "engine, laugh, loud"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage of a man driving a car in the dark"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a child speaks in closed space"], "sample_ids": ["ugHJF0hfYkg", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["loud, propeller, move", "child, space, 
speak"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "people speak as gunfire rings out"], "sample_ids": ["vimzuGQvdcU", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["a, man, yells", "gunfire, ring, speak"], "captions_pred_video": ["a group of people are rafting down a river", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a motor idles, accelerates, then slows down.", "a woman speaks as she rubs two objects together"], "sample_ids": ["vYkA3cfXp5Q", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["speed, idle, accelerate", "two objects, woman, speak"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of speed", "label": 0}, {"captions": ["a motorcycle engine works nearby", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tOSWIURC-4", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["engine, work, nearby", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a lawn mower is running ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a scene from a movie", "label": 1}, {"captions": ["a person is burping while a girl speaks", "someone whistles a tune"], "sample_ids": ["vdoxuJn9lTc", "sIXTftIuUgw"], "start_seconds": ["40", "90"], "properties": ["person, burp, girl", "someone, tune, whistle"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a race car approaches quickly and slows down squealing tires"], "sample_ids": ["xvDdE3zNf8Y", "sEprKHm8Sj8"], "start_seconds": ["120", "90"], "properties": ["A, crumple, paper", "car, tires, slows"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a woman speaks and crumples 
paper", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uEU-Hg5MTN8", "zFjIWfSD-4"], "start_seconds": ["27", "410"], "properties": ["a woman, laughs, animal", "People, motor, brakes"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["s4Uz1Ffgo04", "w5W5Kqtc8E"], "start_seconds": ["100", "100"], "properties": ["water, rushes, vehicle", "wind, blow, vehicle"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle zooming past water?", "label": 0}, {"captions": ["a stream runs then someone speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wbHTKEJZyhc", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["stream, run, someone", "men, talk, cars"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background 
video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "wind blows as people chatter quietly"], "sample_ids": ["s4Uz1Ffgo04", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["water, rushes, motorcycle", "wind, chatter, people"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking with wind noise in the background "], "question": "which entity is 
more quiet", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a vehicle engine accelerating then running on idle"], "sample_ids": ["ugHJF0hfYkg", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["engine, idle, continuously", "engine, accelerate, idle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a helicopter is flying overhead ", "an engine is idling"], "question": "which engine is running on idle", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a drill runs and two people laugh"], "sample_ids": ["vhJWZheqaE", "tEE3MpBt1sg"], "start_seconds": ["0", "50"], "properties": ["water drains unevenly, toilet flushes, water drains", "two people, laugh, drill"], "captions_pred_video": [null, "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a toilet is flushed", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["goats bleat and metal clings", "wind blows as people chatter quietly"], "sample_ids": ["tH17JPjDPnc", "xBxDz0CFVn0"], "start_seconds": ["260", "30"], "properties": ["bleat, metal, clings", "wind, chatter, people"], "captions_pred_video": ["feed of the goats eating hay in the barn", "footage is blurry and out of focus"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a goat screams and people speak in the background", "an infant crying as a woman laughs"], "sample_ids": ["xC8kbrKJmco", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["background, goat, scream", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a goat is bleating ", "a baby cries and a woman 
speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["some people speak", "wind blows as people chatter quietly"], "sample_ids": ["vbZ-0lGPneg", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "wind, chatter, people"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks while water drains", "some tunes played by whistling"], "sample_ids": ["vSeGhaZt-aI", "u6BnG6YZqJ4"], "start_seconds": ["50", "0"], "properties": ["water, drain, man", "tune, play, whistling"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wqZ135Ssz0", "vb1fPSDI4c"], "start_seconds": ["60", "30"], "properties": ["two men, woman, birds", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["people speak and tapping occurs", "waves crash against a shoreline and people speak"], "sample_ids": ["tFCUUGdREgA", "yFB25fqfU8I"], "start_seconds": ["70", "300"], "properties": ["people, tap, speak", "wave, crash, shoreline"], "captions_pred_video": ["a person riding a white horse 
in an indoor arena", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be in a desert?", "label": 0}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["uOpoD0gGXcs", "tDlysoZiA1I"], "start_seconds": ["120", "0"], "properties": ["chirps, woman, bird", "animal, grunts, chirps"], "captions_pred_video": ["a herd of cows grazing in the field", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping and a man is speaking", "birds are chirping and a rooster is crowing "], "question": "which entity is a response to a woman chirping for the birds?", "label": 0}, {"captions": ["people clap and speak in the distance", "a small voice speaks, music plays followed by a double whoosh, and then a bell dings"], "sample_ids": ["wwyfGO2J4", "tQWGZLItBXk"], "start_seconds": ["90", "170"], "properties": ["clap, distance, speak", "voice, music, whoosh"], "captions_pred_video": [null, "worms revolution screenshots"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has music", "label": 1}, {"captions": ["frogs croak and vocalize", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yswmmRZFItk", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["croak, vocalize, frog", "male, duck, laugh"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a infant makes noise 
and is excited"], "sample_ids": ["tMbMDvT50j8", "wIJK3-5y0kA"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "noise, excited, infant"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is a baby", "label": 0}, {"captions": ["a airplane flies overhead as a woman speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zj2R0XoFr5k", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["airplane, fly, woman", "a woman, something, fried"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a vehicle engine accelerating then running on idle"], "sample_ids": ["uZesmtKZGSw", "vYkA3cfXp5Q"], "start_seconds": ["250", "30"], "properties": ["men, talk, cars", "engine, accelerate, idle"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of a car driving down the street on a sunny day"], 
"captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "water splashes as an animal walks through"], "sample_ids": ["vK93VuO0yNc", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["male voice, bus, rumble", "animal, water, splashes"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["roadway noise occurs and a truck accelerates", "a toilet flushes and water drains"], "sample_ids": ["tgbONvsP47Y", "sfAvvZwdLCY"], "start_seconds": ["0", "20"], "properties": ["noise, truck, accelerate", "water drains, flushes, water"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a car is driving on the road ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tPJvjq9QePY", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["bleats, person, speak", "loud, jet engine, roar"], "captions_pred_video": ["a dog and a sheep in a barn", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a baby cries and a man speaks", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a man speaks over intermittent keyboard taps"], "sample_ids": ["vSeGhaZt-aI", "tw76HGONaKg"], "start_seconds": 
["50", "570"], "properties": ["water, bubbles, run", "audio, man, keyboard"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "an infant crying as a woman laughs"], "sample_ids": ["xZepNM9qcRA", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["background, motor, run", "a, laugh, infant"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "a car speeding up in the distance"], "sample_ids": ["vK93VuO0yNc", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["male voice, bus, rumble", "distance, car, 
speed"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", null], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["food is frying while a woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["yhQ2Lg-7qDY", "wwyfGO2J4"], "start_seconds": ["130", "90"], "properties": ["food, woman, speak", "people, applaud, hoot"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a man speaks followed by another man speaking outside"], "sample_ids": ["u2f5NpsoHBg", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "two men, speak, follow"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two people speaking?", "label": 1}, {"captions": ["a man speaks uses a drill", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["x5eIC7S0fbg", "vfYTJq7nU"], "start_seconds": ["60", "130"], "properties": ["A man is speaking, uses a drill, and is a tool", "rustling, ducks, quack"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", null], "captions_pred_audio": ["a man is speaking and using a power tool ", "a duck quacks and a woman 
speaks"], "question": "which entity is a tool", "label": 0}, {"captions": ["a dog whimpers and a woman briefly talks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y1saVTXsKwc", "wz7N8YRy74I"], "start_seconds": ["80", "30"], "properties": ["a, dog, talk", "rooster, crow, background, men"], "captions_pred_video": ["a dog playing with a pink ball", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people talking", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a clock ticktocks"], "sample_ids": ["ziUT9IFTkjg", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["background, birds, rustling", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["several ducks are quacking and squawking", "water pouring and bubbling"], "sample_ids": ["wfHeoPDLMaM", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["quacking, squawking, ducks", "water, bubbles, pouring"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the 
middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["ducks are quacking", "water is running from a faucet"], "question": "which entity is silent", "label": 1}, {"captions": ["wind blows as people chatter quietly", "an airplane engine spools and people speak"], "sample_ids": ["xBxDz0CFVn0", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["wind, chatter, people", "airplane, engine, spool"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a jet engine is running and people are talking"], "question": "which entity is more quiet", "label": 0}, {"captions": ["someone is snoring while sleeping", "wind blows as people chatter quietly"], "sample_ids": ["ujMt0-D-x2k", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["snore, sleep, someone", "wind, chatter, people"], "captions_pred_video": ["of the dog playing with a toy on the floor", 
"footage is blurry and out of focus"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sjlVMgdGSK0", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["car, revving, loudly", "two men, woman, birds"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a machine runs continuously", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wdXV3Pv0jiY", "vb1fPSDI4c"], "start_seconds": ["11", "30"], "properties": ["machine, running, continuously", "multiple, people, yell"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a crowd of people are talking and laughing"], "question": "which entity is not silent", "label": 1}, {"captions": ["a horse runs while two women talk", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sdvI1mHAsc", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["two women, horse, run", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["su6FAOcOA8c", 
"su6FAOcOA8c"], "start_seconds": ["4", "4"], "properties": ["engine, run, woman", "engine, idle, woman"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman is speaking and a subway train is moving "], "question": "which entity has a running engine", "label": 0}, {"captions": ["electronic beeps occur in a short series", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y682ml90jGw", "w5W5Kqtc8E"], "start_seconds": ["11", "100"], "properties": ["beeps, series, electronic", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "water pouring and bubbling"], "sample_ids": ["w5W5Kqtc8E", "uyRfq-jKPpo"], "start_seconds": ["100", "50"], "properties": ["wind, blow, vehicle", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a motorboat is moving and people are shouting 
and cheering ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sfAvvZwdLCY", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "engine, accelerate, idle"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a toilet is flushed", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "a motor runs steadily as a man speaks, then the motor revs twice"], "sample_ids": ["yDoT73BWsdA", "ylpYOorfH4o"], "start_seconds": ["10", "410"], "properties": ["engine, revs, vehicle", "motor, run, steady"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and an engine is revving"], "question": "which motor is running steadily", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "someone is typing on a computer keyboard"], "sample_ids": ["uRlbY6aoBU", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["a, distance, 
sneeze", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is sneezing ", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["wind blowing followed by a zoom", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vr8ZXjEBhMQ", "tdWhHV3X25Q"], "start_seconds": ["150", "60"], "properties": ["wind, blow, zoom", "applause, audience, yells"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking and a crowd is clapping"], "question": "which entity is a person", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a man speaks as horns blow"], "sample_ids": ["xyL9F5VrjkE", "tHyNqRyK34A"], "start_seconds": ["20", "24"], "properties": ["engine, run, wind", "a, man, speaks"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "being taken from inside a vehicle on the street at night"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a car is honking with background noise "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "paper is crumpling consistently"], "sample_ids": ["u7C-AEBQM", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["ticks, rhythmic, quiet", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a ticktock of a clock", "paper is crumpled and crinkled"], "question": "which entity is crumpling consistently", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "a propeller rotates loudly and intensely"], "sample_ids": 
["uPDn2BFTHk", "ugHJF0hfYkg"], "start_seconds": ["140", "10"], "properties": ["woman, laughs, speaks", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wtDqrBygTcU", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["man, engine, run", "water, radio, man"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and a motor is running", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["someone snores nearby", "a woman speaks happily and an animal chirps"], "sample_ids": ["spJCm8tD9Zo", "uWAAAL4CIoc"], "start_seconds": ["90", "0"], "properties": ["someone snores, nearby, someone", "a woman, chirps, animal"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "some men converse over an engine running"], "sample_ids": ["yVumC9TGknc", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["humming, clock, birds", "men, converse, engine"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", null], "captions_pred_audio": ["a series of beeps and chirps", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a person speaking?", 
"label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xSKJGCItUWE", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["engine, run, boy", "water, radio, man"], "captions_pred_video": ["footage of the helicopter flying in the room", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a boy speaking?", "label": 0}, {"captions": ["some men converse over an engine running", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sCiy7QS1U", "uYT5gxnyMWM"], "start_seconds": ["300", "50"], "properties": ["men, converse, engine", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["yZp6xizR0yU", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["animal, bleat, cry", "animal, grunts, snorts"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a woman is speaking and a baby is crying"], "question": "which animal is grunting and snorting", "label": 1}, {"captions": ["an airplane engine spools and people speak", "some tunes played by whistling"], "sample_ids": ["wTjoRj1se3U", "u6BnG6YZqJ4"], "start_seconds": ["390", "0"], "properties": ["airplane, engine, spool", "tune, play, whistling"], 
"captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a jet engine is running and people are talking", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "vehicles pass by on a roadway"], "sample_ids": ["viuTg1M-dqg", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["two men, speak, follow", "pass, vehicle, roadway"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is driving on the road "], "question": "which entity is more likely to be in a city", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["sWZzXuWYY", "tDlysoZiA1I"], "start_seconds": ["420", "0"], "properties": ["male, speech, banging", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["ugHJF0hfYkg", "su6FAOcOA8c"], "start_seconds": ["10", "4"], "properties": ["loud, propeller, move", "engine, idle, woman"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a subway train is moving "], "question": "which is quieter", "label": 1}, {"captions": ["a man 
yells and speaks as water splashes", "someone is typing on a computer keyboard"], "sample_ids": ["vimzuGQvdcU", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["a, man, yells", "keyboard, type, computer"], "captions_pred_video": ["a group of people are rafting down a river", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a person is typing on a keyboard"], "question": "which is a more active activity", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wfHeoPDLMaM", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["quacking, squawking, ducks", "airplane, boy, fly"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["ducks are quacking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "dishes cling together 
then a man begins to speak"], "sample_ids": ["slZLHwNbbt4", "sQGXqGcwOTc"], "start_seconds": ["300", "3"], "properties": ["a, horn, run", "cling, speak, dishes"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "mechanisms are operating and water is splashing "], "question": "what is the man doing in the second image?", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "a infant makes noise and is excited"], "sample_ids": ["yajyRTUQk3U", "wIJK3-5y0kA"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "noise, excited, infant"], "captions_pred_video": ["- a woman cooking in the kitchen", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a woman speaks as she rubs two objects together"], "sample_ids": ["y4tPJXBKDig", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["a, noise, talk", "two objects, woman, speak"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["water flows followed by women screaming", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w5W5Kqtc8E", "wDVMhEdTiVw"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is about water flowing?", "label": 0}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "an infant crying frantically"], "sample_ids": ["uOpoD0gGXcs", "zwOBqeFTgiU"], "start_seconds": ["120", "30"], "properties": ["chirps, woman, bird", "cry, infant, frantically"], "captions_pred_video": ["a herd of cows grazing in the field", "of the baby crying in the car seat"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "someone whistles briefly"], "sample_ids": ["tZGN5a7ybxo", "uFoga8sHpiw"], "start_seconds": ["60", "90"], "properties": ["ring, train, horn", "sound, duration, pitch"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "footage of a bird in a cage"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a person whistles a song"], 
"question": "which entity has a higher pitch", "label": 1}, {"captions": ["dogs barking and whimpering", "a motor idles, accelerates, then slows down."], "sample_ids": ["tIY7qOV3rEM", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "speed, idle, accelerate"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["a man is filing a hard object", "a man speaks followed by another man speaking outside"], "sample_ids": ["vveS8HT7Uog", "viuTg1M-dqg"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "two men, speak, follow"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about speaking", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "small dogs yip and bark sharply"], "sample_ids": ["sWZzXuWYY", "v-wcQf4BDY0"], "start_seconds": ["420", "120"], "properties": ["male, speech, banging", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "male speech with light ticking"], "sample_ids": ["spYNpeN7rPY", "xO-Q2BlIIPU"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "male, speech, ticking"], "captions_pred_video": ["in 10 
words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a clock with a green glowing display showing the time 09 07 2016 12 31 2016"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["a weapon fires multiple times", "a clock ticktocks"], "sample_ids": ["sMC07Ucy7kg", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["weapon, fire, multiple", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is from a car's point of view", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "water flows and trickles"], "sample_ids": ["wqADXCzngMw", "tB7hWb9gTuQ"], "start_seconds": ["340", "30"], 
"properties": ["engine, idle, man", "water, flow, trickle"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a mechanical buzzing getting louder"], "sample_ids": ["su6FAOcOA8c", "sEprKHm8Sj8"], "start_seconds": ["4", "90"], "properties": ["engine, run, woman", "noise, loud, buzzing"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["people speak then an engine runs", "winds blows roughly as a vehicle races past"], "sample_ids": ["uMTTDZ2mb4", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["engine, run, people", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["a machine beeps continuously", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["y682ml90jGw", "su6FAOcOA8c"], "start_seconds": ["11", "4"], "properties": ["beeps, machine, continuously", "engine, idle, woman"], "captions_pred_video": [null, "shows a group 
of people riding on a crowded subway train"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and a subway train is moving "], "question": "which entity is a machine?", "label": 0}, {"captions": ["a person is whistling", "a toilet flushes and a female speaks"], "sample_ids": ["sIXTftIuUgw", "yaln9y8I7ms"], "start_seconds": ["90", "230"], "properties": ["person, whistling, person", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 0}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["s4Uz1Ffgo04", "uWAAAL4CIoc"], "start_seconds": ["100", "0"], "properties": ["roars, background, people speaking", "a woman, chirps, animal"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman is speaking and a dog is barking "], "question": "which entity is quieter", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "winds blows roughly as a vehicle races past"], "sample_ids": ["zl9Dqx-j7q4", "xjvTpk2Zpr8"], "start_seconds": ["6", "70"], "properties": ["motors rev, laugh, loudly", "wind, blows, vehicle"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a jet engine roars ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman and man are speaking", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["vbpKkWvfOu4", "xV7Mg1QucSc"], "start_seconds": ["560", "14"], "properties": ["two people, speaking, woman, man", "alarm, ticktocks, 
laughs"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "an alarm clock ticks and a woman laughs"], "question": "which entity is about a clock ticktocking and a man laughing?", "label": 1}, {"captions": ["electronic beeps occur in a short series", "an infant crying frantically"], "sample_ids": ["y682ml90jGw", "zwOBqeFTgiU"], "start_seconds": ["11", "30"], "properties": ["beeps, series, electronic", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a beeping sound is being made ", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "an infant crying as a woman laughs"], "sample_ids": ["wz7N8YRy74I", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["rooster, crow, background, men", "a, laugh, infant"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person is whistling", "paper is crumpling consistently"], "sample_ids": ["sIXTftIuUgw", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["person, whistling, person", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person whistling a song", "paper is crumpled and crinkled"], "question": "which entity 
is crumpling", "label": 1}, {"captions": ["a motorcycle engine is idling", "pigeons vocalize and birds chirp"], "sample_ids": ["vZAqdHZ81yA", "uiS58TNyUiw"], "start_seconds": ["180", "430"], "properties": ["engine, motorcycle, idling", "vocalize, bird, chirp"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of the pigeon in the cage"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a piece of wood is being placed down and sawed", "a door opens and birds chirp"], "sample_ids": ["uiItxDsDMFI", "yeFvk9x0wWI"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "door, open, birds"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a saw is being used with background noise ", "birds chirp in the background as a car drives by "], "question": "which entity is a door?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sQwlkXjQabo", "tiDFTC-5vU"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "male, duck, laugh"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "pigeons vocalize and birds chirp"], "sample_ids": ["tDVADusiIoc", "uiS58TNyUiw"], "start_seconds": ["60", "430"], "properties": ["water, radio, man", "vocalize, bird, chirp"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of the pigeon in the cage"], "captions_pred_audio": ["a 
man is speaking while the wind is blowing and water is splashing", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["food is frying while a woman speaks", "a car speeding up in the distance"], "sample_ids": ["yhQ2Lg-7qDY", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["food, woman, speak", "distance, car, speed"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a church bell rings several times", "a man speaks as a car is passing by"], "sample_ids": ["sUVVjE3Ucp8", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["ring, bell, several", "a, car, pass"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a church bell is ringing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["an audience gives applause", "a dog whimpers as someone inhales/exhales briefly"], "sample_ids": ["x6iCUDmRpKQ", "vmrxwuAMb2I"], "start_seconds": ["38", "40"], "properties": ["applause, audience, give", "a dog, inhales, exhales"], "captions_pred_video": ["a black background with the moon and stars in the sky", "of the dog laying on the bed with his head out of the blanket"], "captions_pred_audio": ["a 
group of people are clapping and cheering", "a dog barks and growls"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a woman speaks happily and an animal chirps"], "sample_ids": ["wtDqrBygTcU", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["man, engine, run", "a woman, chirps, animal"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", null], "captions_pred_audio": ["a man is speaking and a motor is running", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "wind blows as people chatter quietly"], "sample_ids": ["ylpYOorfH4o", "xBxDz0CFVn0"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "wind, chatter, people"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a helicopter engine runs continuously"], "sample_ids": ["wTjoRj1se3U", "ugHJF0hfYkg"], "start_seconds": ["390", "10"], "properties": ["engine, run, people", "engine, running, 
continuously"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a jet engine is running and people are talking", "a helicopter is flying overhead "], "question": "which entity has a running engine", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["u6jIvCtKarQ", "vb1fPSDI4c"], "start_seconds": ["70", "30"], "properties": ["a, man, speaks", "multiple, people, yell"], "captions_pred_video": ["footage of a person using a blender on a stove top", null], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["w34HjHr6gAY", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["beeps, hit, woman", "animal, grunts, snorts"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman is speaking and a baby is crying"], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["multiple adults speaking, and a child shouting in the 
background", "water flows and trickles"], "sample_ids": ["yks4cLgIDMc", "tB7hWb9gTuQ"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "water, flow, trickle"], "captions_pred_video": ["footage of two kids wrestling on the floor", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and a child is crying", "water is splashing and gurgling"], "question": "which entity is a moving object", "label": 1}, {"captions": ["someone snores nearby", "paper is crumpling consistently"], "sample_ids": ["spJCm8tD9Zo", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["someone snores, nearby, someone", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person is snoring loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "pigeons vocalize and birds chirp"], "sample_ids": ["wqADXCzngMw", "uiS58TNyUiw"], "start_seconds": ["340", "430"], "properties": ["engine, idle, man", "vocalize, bird, chirp"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "of the pigeon in the cage"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a car accelerates and wind blows"], "sample_ids": ["sQwlkXjQabo", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["water, spray, surface", "accelerates, wind, blows"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a race car accelerates and revs its 
engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["t69a8aRKhmc", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "male, duck, laugh"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["u2f5NpsoHBg", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["person, laugh, clap", "two men, woman, birds"], "captions_pred_video": ["is being projected on a screen at the front of the stage", null], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tiDFTC-5vU", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["male, duck, laugh", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a child speaks in closed space"], "sample_ids": ["v0x1odnXtP0", "yW6FWLSLkx4"], "start_seconds": ["210", "40"], "properties": ["keyboard, type, computer", "child, space, speak"], "captions_pred_video": ["how to make money on youtube in spanish", "of the doll sitting on the bed with her hand 
outstretched towards the viewer"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a child speaks in closed space"], "sample_ids": ["zgUgkpk78xU", "yW6FWLSLkx4"], "start_seconds": ["70", "40"], "properties": ["horn, bell, train", "child, space, speak"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["x9JovgqUcs", "wqZ135Ssz0"], "start_seconds": ["500", "60"], "properties": ["a, man, speaks, keyboard", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "waves crash against a shoreline and people speak"], "sample_ids": ["u7C-AEBQM", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["ticks, rhythmic, quiet", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and laughing while 
water is splashing and gurgling"], "question": "which entity is more quiet", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "birds chirp and an insect buzzes around"], "sample_ids": ["vddP56-ogds", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["liquid, laughs, man", "bird, chirp, insect"], "captions_pred_video": [null, "a bee on a purple thistle flower"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a bee buzzes and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "an airplane engine runs"], "sample_ids": ["yajyRTUQk3U", "yVPZ2MNWpms"], "start_seconds": ["400", "0"], "properties": ["a woman, something, fried", "engine, airplane, runs"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["w2M4i1mklOA", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "a woman, a television program, a bird"], "captions_pred_video": ["footage of an antique clock", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking and a dog is whimpering"], "question": "which entity is quieter", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "winds blows roughly as a vehicle races past"], "sample_ids": ["ukg5L09Wpvo", "xjvTpk2Zpr8"], "start_seconds": ["150", "70"], "properties": ["a train, a horn, a bell", "wind, blows, vehicle"], "captions_pred_video": 
["footage of a train passing through a forest on a dirt road", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yks4cLgIDMc", "zj2R0XoFr5k"], "start_seconds": ["170", "50"], "properties": ["background, speaking, child", "airplane, boy, fly"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and a child is crying", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["siJFXfGWgDk", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["man, woman, vehicle", "a woman, laughs, animal"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a man speaks while playing a video game on a keyboard", "an airplane engine runs"], "sample_ids": ["tw76HGONaKg", "yVPZ2MNWpms"], "start_seconds": ["570", "0"], "properties": ["A, game, keyboard", "engine, airplane, runs"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of 
zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a car is driving by on the road "], "question": "which is not a video game", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "water pouring and bubbling"], "sample_ids": ["wqADXCzngMw", "uyRfq-jKPpo"], "start_seconds": ["340", "50"], "properties": ["engine, idle, man", "water, bubbles, pouring"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "water is running from 
a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "water flows as men speak and yell"], "sample_ids": ["s6DESzUTGjY", "vJ7JPEFhyLA"], "start_seconds": ["16", "16"], "properties": ["wind, laugh, woman", "water, flow, men"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "an infant crying as a woman laughs"], "sample_ids": ["vddP56-ogds", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["water, splash, person, laugh", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a baby cries and a woman speaks"], "question": "which entity is about a person laughing?", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "a duck quacks 
continuously"], "sample_ids": ["vddP56-ogds", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "vehicles pass by on a roadway"], "sample_ids": ["slZLHwNbbt4", "tgbONvsP47Y"], "start_seconds": ["300", "0"], "properties": ["clap, distance, horn", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a car is driving on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["some tunes played by whistling", "a person snores loudly multiple times at a close distance"], "sample_ids": ["u6BnG6YZqJ4", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["tune, play, whistling", "loud, multiple, distance"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", null], "captions_pred_audio": ["a person whistling a song", "a person snoring loudly"], "question": "which entity is not a tune", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sOa7g-44Dag", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["audio, scratching, man", "multiple, people, yell"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a crowd of people are talking and 
laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a man speaks as a car is passing by"], "sample_ids": ["uzQnlJXBbOM", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "a, car, pass"], "captions_pred_video": ["footage of a person using a cell phone on a table", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a telephone rings and a man speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a woman speaks happily and an animal chirps"], "sample_ids": ["uzQnlJXBbOM", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["ringing, beep, stop", "a woman, chirps, animal"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["soTOh3zYJfY", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["vehicle, skid, tires", "two men, woman, birds"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", null], "captions_pred_audio": ["a race car accelerates and revs its 
engine ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a drill drills through something then people begin laughing", "a man speaks as a car is passing by"], "sample_ids": ["tEE3MpBt1sg", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "a, car, pass"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an insect buzzes around continuously", "a woman speaks as she rubs two objects together"], "sample_ids": ["v25l1jef3JY", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["buzzes, continuously, insect", "two objects, woman, speak"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "an adult male speaks and dials a rotary phone"], "sample_ids": ["wnpJndXuxLc", "tK4VlLsNxak"], "start_seconds": ["50", "120"], "properties": ["blows, vehicle, train", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and using a sewing machine"], "question": "which entity is a person", "label": 1}, {"captions": ["a dark barks and whimpers", "a woman speaks happily and an animal chirps"], "sample_ids": ["sYj4hpDUZDQ", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["barks, whimpers, dark", "a woman, chirps, animal"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wz7N8YRy74I", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, people", 
"engine, accelerate, idle"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vXlk0lIQBFo", "w5W5Kqtc8E"], "start_seconds": ["470", "100"], "properties": ["wind, speak, vocalize", "wind, blow, vehicle"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "waves crash against a shoreline and people speak"], "sample_ids": ["ugHJF0hfYkg", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["engine, running, continuously", "wave, crash, shoreline"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a man speaks followed by another man speaking outside"], "sample_ids": ["vddP56-ogds", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["water, splash, person, laugh", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is 
speaking with background noise and breathing sounds "], "question": "which entity has a man speaking nearby?", "label": 0}, {"captions": ["wind blowing followed by a zoom", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["vr8ZXjEBhMQ", "zO-LSSY92ZM"], "start_seconds": ["150", "30"], "properties": ["wind, blow, zoom", "liquid, surface, sound"], "captions_pred_video": ["is taken from a motorcycle's point of view", "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "steam is hissing and hissing"], "question": "which entity is not a zoom", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xERFUeZONz8", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["ring, approach, traffic", "rustling, ducks, quack"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", null], "captions_pred_audio": ["an emergency vehicle siren 
blares", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tdWhHV3X25Q", "xKB8O8LTs6s"], "start_seconds": ["60", "70"], "properties": ["applause, audience, yells", "music, gunfire, explosion"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["uZesmtKZGSw", "y2ZBGpgbhHM"], "start_seconds": ["250", "30"], "properties": ["men, talk, cars", "birds, tweet, pant"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "birds chirping and a dog panting"], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs barking and whimpering", "a kid speaks followed by music playing"], "sample_ids": ["tIY7qOV3rEM", "tQWGZLItBXk"], "start_seconds": ["0", "170"], "properties": ["barking, 
whimpering, dog", "music, kid, speak"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "worms revolution screenshots"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "multiple people speak and children yell while water gurgles"], "sample_ids": ["x9JovgqUcs", "vb1fPSDI4c"], "start_seconds": ["500", "30"], "properties": ["a, man, speaks, keyboard", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["spYNpeN7rPY", "zl9Dqx-j7q4"], "start_seconds": ["1", "6"], "properties": ["a clock, ticktock, man", "engine, laugh, loud"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "ducks quack as a man speaks and makes a duck sound"], "sample_ids": ["uYT5gxnyMWM", "vfYTJq7nU"], "start_seconds": ["50", "130"], "properties": ["female, spraying, scream", "ducks, quack, man"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a duck quacks and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "an electric engine works nearby followed by a child talking"], "sample_ids": ["yeFvk9x0wWI", "xSKJGCItUWE"], "start_seconds": ["30", "10"], "properties": ["chirp, twitter, clatter", "engine, work, child"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front 
of a fence", "footage of the helicopter flying in the room"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a high pitched engine is running and a child speaks"], "question": "which entity is a machine", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y4tPJXBKDig", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["a, noise, talk", "a woman, laughs, animal"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl talking", "label": 0}, {"captions": ["a sleeping person snores and wheezes", "a man speaks as a car is passing by"], "sample_ids": ["spJCm8tD9Zo", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["snores, wheezes, sleeps", "a, car, pass"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zF8yoL0rkbI", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["engine, run, someone", "female, 
spraying, scream"], "captions_pred_video": ["footage of the traffic on the street at night", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "waves crash against a shoreline and people speak"], "sample_ids": ["sQGXqGcwOTc", "yFB25fqfU8I"], "start_seconds": ["3", "300"], "properties": ["cling, speak, dishes", "wave, crash, shoreline"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a person surfing in the ocean"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["goats bleat and people speak", "a person snores loudly multiple times at a close distance"], "sample_ids": ["z5iUE5h0EPs", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["goats bleat, people speak, language", "loud, multiple, distance"], "captions_pred_video": ["of the goat in the barn", null], "captions_pred_audio": ["a goat bleats and a man speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a car speeding up in the distance"], "sample_ids": ["sjlVMgdGSK0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["accelerates, vehicle, race car", "distance, car, speed"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which car is speeding up in the distance", "label": 1}, {"captions": ["a machine beeps continuously", "wind blowing 
followed by a zoom"], "sample_ids": ["y682ml90jGw", "vr8ZXjEBhMQ"], "start_seconds": ["11", "150"], "properties": ["beeps, machine, continuously", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a beeping sound is being made ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["goats bleat and people speak", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["z5iUE5h0EPs", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["goats bleat, people speak, language", "loud, laughter, intermittent"], "captions_pred_video": ["of the goat in the barn", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a goat bleats and a man speaks", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an electric engine works nearby followed by a child talking", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xSKJGCItUWE", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["engine, work, child", "engine, revs, vehicle"], "captions_pred_video": ["footage of the helicopter flying in the room", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people speak and tapping occurs", "a man speaks while a machine runs before a smoke alarm beeps"], "sample_ids": ["tFCUUGdREgA", "sG7TyPnFDR0"], "start_seconds": ["70", "180"], "properties": ["people, tap, speak", "beeps, machine, smoke alarm"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a person is using an espresso machine in a restaurant"], "captions_pred_audio": ["a man is speaking and walking with 
wind noise in the background ", "a man is speaking and a microwave oven is beeping "], "question": "which entity has a smoke alarm beep?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "water flows as men speak and yell"], "sample_ids": ["tOSWIURC-4", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["engine, work, nearby", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wTjoRj1se3U", "sSMl2vc3ek"], "start_seconds": ["390", "20"], "properties": ["engine, run, people", "loud, multiple, distance"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "winds blows roughly as a vehicle races past"], "sample_ids": ["uWAAAL4CIoc", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["a, dog, vocalize", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["uPDn2BFTHk", "sQGXqGcwOTc"], "start_seconds": ["140", "3"], "properties": ["woman, laughs, speaks", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], 
"captions_pred_audio": ["a baby laughs and a woman speaks", "mechanisms are operating and water is splashing "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["birds chirp as a bell rings", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["ziUT9IFTkjg", "wqZ135Ssz0"], "start_seconds": ["10", "60"], "properties": ["chirp, bell, ring", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is about birds?", "label": 0}, {"captions": ["a goat screams and people speak in the background", "water flows as men speak and yell"], "sample_ids": ["xC8kbrKJmco", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["background, goat, scream", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "someone whistles a tune"], "sample_ids": ["yks4cLgIDMc", "sIXTftIuUgw"], "start_seconds": ["170", "90"], "properties": ["background, speaking, child", "someone, tune, whistle"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wqN6IIHw3po", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["rain, surface, fall", "applause, audience, yells"], "captions_pred_video": ["in your own words what is 
happening in this screenshot? blood splattered all over the place", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["birds tweet and squawk", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["w1mlz3Pe4fU", "yDoT73BWsdA"], "start_seconds": ["300", "10"], "properties": ["squawk, tweet, scream", "engine, revs, vehicle"], "captions_pred_video": ["of a bird in a cage", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["birds are chirping and singing", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "vehicle tires screech and a man speaks before a car door opens"], "sample_ids": ["wyllXV6PjKo", "sxYkFKFIZD0"], "start_seconds": ["30", "20"], "properties": ["a kid, talk, cry", "screech, man, door"], "captions_pred_video": [null, "2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking while a car is revving and accelerating with a squeal in the background "], "question": "which entity has a door open?", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a infant makes noise and is excited"], "sample_ids": ["sDSppXIlJrs", "wIJK3-5y0kA"], "start_seconds": ["27", "30"], "properties": ["microphone, water, wind", "noise, excited, infant"], "captions_pred_video": ["a man is paddling a small wooden boat in 
the water", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a baby cries and a woman speaks"], "question": "which noise is made by a human", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yDoT73BWsdA", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["engine, revs, vehicle", "airplane, boy, fly"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zhx6hoYrHeI", "tiDFTC-5vU"], "start_seconds": ["160", "30"], "properties": ["engine, sputter, rough", "male, duck, laugh"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "paper is crumpling consistently"], "sample_ids": ["vmrxwuAMb2I", "v5cSxLaHADY"], "start_seconds": ["40", "0"], "properties": ["a dog, inhales, exhales", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a dog barks and growls", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a 
propeller rotates loudly and intensely"], "sample_ids": ["ugHJF0hfYkg", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["loud, intense, propeller", "loud, intense, propeller"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a helicopter is flying overhead ", "a helicopter is flying overhead "], "question": "which propeller rotates loudly and intensely", "label": 1}, {"captions": ["small dogs yip and bark sharply", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["v-wcQf4BDY0", "sYITalLZjj4"], "start_seconds": ["120", "30"], "properties": ["bark, yip, sharply", "water, rushes, background, birds"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "two ducks are swimming in the water near each other"], "captions_pred_audio": ["a dog barks and growls", "wind blows and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["goats bleat and people speak", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["z5iUE5h0EPs", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["goats bleat, people speak, language", "female, spraying, scream"], "captions_pred_video": ["of the goat in the barn", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a goat bleats and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a person is whistling a tune", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["scYRUkrFLiQ", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["a, tune, whistle", "People, motor, brakes"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", null], "captions_pred_audio": ["a person whistling a song", "a man is speaking 
while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 0}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a machine engine runs and a man speaks"], "sample_ids": ["xKB8O8LTs6s", "vs65y4qmyBE"], "start_seconds": ["70", "340"], "properties": ["music, radio, gunshots", "engine, run, man"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a heavy engine is running and men are speaking "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a man speaks with another voice speaking in the background"], "sample_ids": ["uiItxDsDMFI", "u21-Z5gJCB8"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "background, voice, man"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a woman sneezes then speaks", "water flows and trickles"], "sample_ids": ["x4dZyf9Gbj0", "tB7hWb9gTuQ"], "start_seconds": ["130", "30"], "properties": ["sneezes, speaks, woman", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and out of focus", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman sneezes and speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a jet engine roars, almost making a man inaudible"], 
"sample_ids": ["yYEVLuqEytU", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["animal, pig, background", "loud, jet engine, roar"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["several sheep bleat and a man speaks", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a person is snoring while sleeping"], "sample_ids": ["v5P-ThUCINM", "vJrjSeP17yE"], "start_seconds": ["400", "40"], "properties": ["background, chirp, bird", "a person is sleeping, snoring, person"], "captions_pred_video": [null, "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["some men converse over an engine running", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sCiy7QS1U", "uYT5gxnyMWM"], "start_seconds": ["300", "50"], "properties": ["men, converse, engine", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["two frogs croak at each other", "some tunes played by whistling"], "sample_ids": ["zg0X6BnhOLQ", "u6BnG6YZqJ4"], "start_seconds": ["410", "0"], "properties": ["two frogs, croak, at each other", "tune, play, whistling"], "captions_pred_video": ["footage of lightning in the sky at night", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a frog is croaking", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a 
woman talking as an infant is crying", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tMbMDvT50j8", "sSMl2vc3ek"], "start_seconds": ["12", "20"], "properties": ["a, talk, infant", "loud, multiple, distance"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a stream of water runs briefly"], "sample_ids": ["zF8yoL0rkbI", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["engine, run, someone", "stream, water, run"], "captions_pred_video": ["footage of the traffic on the street at night", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a clock ticktocks in wind", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["yVumC9TGknc", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, wind", "loud, laughter, intermittent"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a series of beeps and chirps", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["there are rhythmical snoring nearby", "a frog croaks as other frogs croak in the background"], "sample_ids": ["ujMt0-D-x2k", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["snoring, rhythmical, nearby", "background, frog, croak"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a close up of a frog in the water"], 
"captions_pred_audio": ["a person is snoring loudly", "a frog is croaking"], "question": "which entity is not a frog?", "label": 0}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "an infant crying as a woman laughs"], "sample_ids": ["x5cuQjOdM3E", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["cat, talk, meow", "a, laugh, infant"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a cat meows and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["an animal quacks rapidly", "a machine beeps continuously"], "sample_ids": ["vh30P49Po6s", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["animal, quacks, rapidly", "beeps, machine, continuously"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "an airplane engine spools and people speak"], "sample_ids": ["sQwlkXjQabo", "wTjoRj1se3U"], "start_seconds": ["10", "390"], "properties": ["liquid, surface, spray", "airplane, engine, spool"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["spraying followed by silence", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vXlk0lIQBFo", "wDVMhEdTiVw"], "start_seconds": ["470", "30"], "properties": ["wind, speak, vocalize", "gun, shoot, water"], "captions_pred_video": ["- a woman and two 
donkeys in a fenced in area", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sxYkFKFIZD0", "su6FAOcOA8c"], "start_seconds": ["20", "4"], "properties": ["screech, man, door", "engine, idle, woman"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["water rushes by", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x-PeY8Yb8M4", "xKB8O8LTs6s"], "start_seconds": ["300", "70"], "properties": ["water, rushes, by", "music, gunfire, explosion"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car is driving on a wet road ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a small engine spits as it runs", "a woman speaks as she rubs two objects together"], "sample_ids": ["sZvwOuuPGP0", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], 
"properties": ["spits, engine, runs", "two objects, woman, speak"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a medium engine is running ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["water quietly rushes by while birds chirp in the background", "a stream of water runs briefly"], "sample_ids": ["sYITalLZjj4", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["water, rushes, background, birds", "stream, water, run"], "captions_pred_video": ["two ducks are swimming in the water near each other", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["wind blows and birds chirp", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vs65y4qmyBE", "uZesmtKZGSw"], "start_seconds": ["340", "250"], 
"properties": ["engine, run, man", "men, talk, cars"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a helicopter engine idles continuously", "paper is crumpling consistently"], "sample_ids": ["ugHJF0hfYkg", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["engine, idle, continuously", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a helicopter is flying overhead ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a machine runs continuously", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wdXV3Pv0jiY", "zl9Dqx-j7q4"], "start_seconds": ["11", "6"], "properties": ["machine, running, continuously", "engine, laugh, loud"], "captions_pred_video": ["footage is blurry and shaky", "footage of a man driving a car in the dark"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a jet engine roars "], "question": "which entity is not a machine?", "label": 1}, {"captions": ["a dog barks and whimpers", "a 
propeller rotates loudly and intensely"], "sample_ids": ["sShpyu2l4YQ", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["barks, whimpers, dog", "loud, intense, propeller"], "captions_pred_video": ["the puppies are playing with a toy", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a dog is barking and growling", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uKCSGgof8gI", "uYT5gxnyMWM"], "start_seconds": ["12", "50"], "properties": ["chirps, distance, signal", "female, spraying, scream"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a duck quacks continuously"], "sample_ids": ["wjsXBsc7M40", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "quacks, continuously, duck"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person speaks briefly", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zOZleIRqZm4", "tDVADusiIoc"], "start_seconds": ["80", "60"], "properties": ["person, talk, brief", "water, radio, man"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a 
man is speaking with crickets chirping in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a stream of water flows as people talk and wind blows"], "sample_ids": ["tw76HGONaKg", "xBxDz0CFVn0"], "start_seconds": ["570", "30"], "properties": ["A, game, keyboard", "stream, water, flow"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["water flows followed by women screaming", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w5W5Kqtc8E", "zj2R0XoFr5k"], "start_seconds": ["100", "50"], "properties": ["water, flow, women", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", 
"a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["ylpYOorfH4o", "ziUT9IFTkjg"], "start_seconds": ["410", "10"], "properties": ["engine, run, loud", "background, birds, rustling"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "birds are chirping and a chime is ringing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "paper folding and crinkling"], "sample_ids": ["u7C-AEBQM", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["ticks, rhythmic, quiet", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a ticktock of a clock", "the wind blows and a mouse clicks "], "question": "which entity is not quiet", "label": 1}, {"captions": ["a motorcycle engine is idling", "a car speeding up in the distance"], "sample_ids": ["vZAqdHZ81yA", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["engine, motorcycle, idling", "distance, car, speed"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a race car accelerates and revs its engine "], "question": "which object is moving faster", "label": 0}, {"captions": ["an electric engine works nearby followed by a child talking", "waves crash against a shoreline and people speak"], "sample_ids": ["xSKJGCItUWE", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["engine, work, child", "wave, crash, shoreline"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a man speaks as horns blow", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tHyNqRyK34A", "vYkA3cfXp5Q"], "start_seconds": ["24", "30"], "properties": ["a, man, speaks", "engine, accelerate, idle"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a car driving down the street on a sunny day"], 
"captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "wind blows as people chatter quietly"], "sample_ids": ["tqR406bGiE", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["flush, water, gurgle", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "dishes cling together then a man begins to speak"], "sample_ids": ["sDSppXIlJrs", "sQGXqGcwOTc"], "start_seconds": ["27", "3"], "properties": ["microphone, water, wind", "cling, speak, dishes"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["the wind is blowing and water is splashing", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "people applaud and hoot and chat quietly"], "sample_ids": ["xZepNM9qcRA", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["background, motor, run", "people, applaud, hoot"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["material crumbles into a microphone", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vofpvUo6NAw", "wDVMhEdTiVw"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, 
microphone", "gun, shoot, water"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a infant makes noise and is excited"], "sample_ids": ["zliInBdC98Y", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["a, baby, cries, wails", "noise, excited, infant"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby cries and a woman speaks"], "question": "which is a more active infant", "label": 1}, {"captions": ["someone snores nearby", "multiple people speak and children yell while water gurgles"], "sample_ids": ["spJCm8tD9Zo", "vb1fPSDI4c"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "multiple, people, yell"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "birds chirp and objects are moved around"], "sample_ids": ["vf44CgrjT0A", "yPUYU6t3rwo"], "start_seconds": ["20", "370"], "properties": ["loud, long, person", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a loud burp", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["motors rev and run loudly 
as a person laughs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zl9Dqx-j7q4", "vb1fPSDI4c"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "multiple, people, yell"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tgbONvsP47Y", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["noise, truck, accelerate", "wind, blow, vehicle"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a man speaks uses a drill"], "sample_ids": ["wvKpEYswXO0", "x5eIC7S0fbg"], "start_seconds": ["150", "60"], "properties": ["sound, water, running", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["of the person preparing food in the kitchen", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "heavy rain splashes as it falls"], "sample_ids": ["uZesmtKZGSw", "wP8ZKrlx3oA"], "start_seconds": ["250", "40"], "properties": ["car, track, man", "fall, rain, splash"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a heavy rain is falling on a surface"], "question": "which entity is not a person", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a horn rings out as a machine runs by"], "sample_ids": ["sjlVMgdGSK0", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["accelerates, vehicle, race car", "a, horn, run"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "an engine runs loudly"], "sample_ids": ["y8WEcpOlT3I", "vqZuVbG6-HI"], "start_seconds": ["40", "130"], "properties": ["harsh, wind, blows", "loud, engine, run"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "various birds chirp and squeal, and an animal 
grunts"], "sample_ids": ["yks4cLgIDMc", "tDlysoZiA1I"], "start_seconds": ["170", "0"], "properties": ["background, speaking, child", "animal, grunts, chirps"], "captions_pred_video": ["footage of two kids wrestling on the floor", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking and a child is crying", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "small dogs yip and bark sharply"], "sample_ids": ["wz7N8YRy74I", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["rooster, crow, background, people", "bark, yip, sharply"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a train horn blows as it passes by", "a baby laugh at a sputter"], "sample_ids": ["zVacuqSb4LI", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["horn, blows, train", "laugh, sputter, baby"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is a person", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a woman speaks as she rubs two objects together"], "sample_ids": ["sNB8zxXneIM", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["several, quack, cocks", "two objects, woman, speak"], "captions_pred_video": ["a group of geese in a cage", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["birds chirp as a train approaches", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["xM4joTqDVp4", "zFjIWfSD-4"], "start_seconds": ["160", "410"], "properties": ["bird, chirp, train", "People, motor, brakes"], 
"captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a train?", "label": 0}, {"captions": ["a duck quacks continuously", "continuous snoring"], "sample_ids": ["vh30P49Po6s", "sLkeqCDJIyw"], "start_seconds": ["30", "120"], "properties": ["quacks, continuously, duck", "loud, snoring, noise"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", ", what is the man doing on the couch? sleeping"], "captions_pred_audio": ["a duck is quacking loudly", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["birds fly and flutter around", "frogs croak and vocalize"], "sample_ids": ["wGKgwOP3h30", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["fly, flutter, around", "croak, vocalize, frog"], "captions_pred_video": ["of the pigeons in the coop", "a close up of a frog in the water"], "captions_pred_audio": ["pigeons coo and flap their wings", "a frog is croaking"], "question": "which animal is more likely to be a frog", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "an infant crying frantically"], "sample_ids": ["zl9Dqx-j7q4", "zwOBqeFTgiU"], "start_seconds": ["6", "30"], "properties": ["engine, laugh, loud", "cry, infant, frantically"], "captions_pred_video": ["footage of a man driving a car in the dark", "of the baby crying in the car seat"], "captions_pred_audio": ["a jet engine roars ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a toilet flushes and a female speaks"], "sample_ids": ["xBxDz0CFVn0", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["stream, water, flow", 
"female, flushes, toilet"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a source of water", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["x6ijhqRY38s", "w5W5Kqtc8E"], "start_seconds": ["250", "100"], "properties": ["something metal, glass, hit", "wind, blow, vehicle"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["some clanking with distant murmuring", "some tunes played by whistling"], "sample_ids": ["uMTTDZ2mb4", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["clanking, murmuring, distant", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["a male speaks and another male speaks", "a toilet flushes and a female speaks"], "sample_ids": ["viuTg1M-dqg", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["two males, speaking, male", "female, flushes, toilet"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 
1}, {"captions": ["people speak softly as food sizzles", "some tunes played by whistling"], "sample_ids": ["yhQ2Lg-7qDY", "u6BnG6YZqJ4"], "start_seconds": ["130", "0"], "properties": ["food, sizzle, speak", "tune, play, whistling"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yVumC9TGknc", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["humming, clock, birds", "a woman, a television program, a bird"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a series of beeps and chirps", "a woman is speaking and a dog is whimpering"], "question": "which entity has a clock?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "a man speaks while rain falls onto a hard surface"], "sample_ids": ["x6ijhqRY38s", "wqN6IIHw3po"], "start_seconds": ["250", "30"], "properties": ["something metal, glass, hit", "rain, surface, fall"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "in your own words what is happening in this screenshot? 
blood splattered all over the place"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and water is splashing"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a clock ticktocks continuously", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vlJS7LN2XyM", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "multiple, people, yell"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["an adult woman and an adult man speak", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zTLVJCo4WEE", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["two people, adult, speak", "female, spraying, scream"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman and a man speaking?", "label": 0}, {"captions": ["an engine runs and wind blows", "birds chirp and objects are moved around"], "sample_ids": ["vs65y4qmyBE", "yPUYU6t3rwo"], "start_seconds": ["340", "370"], "properties": ["engine, run, wind", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "insects buzz and a man speaks"], "question": "which entity is moving around objects", "label": 1}, {"captions": ["a man 
speaks while a machine runs before a smoke alarm beeps", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sG7TyPnFDR0", "wqZ135Ssz0"], "start_seconds": ["180", "60"], "properties": ["beeps, machine, smoke alarm", "two men, woman, birds"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "a vehicle accelerates squealing tires"], "sample_ids": ["tezvROoo4bs", "sd7xVssqlw"], "start_seconds": ["40", "50"], "properties": ["audio, throttle, speaking", "accelerates, tires, squealing"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", null], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a duck quacks several times", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vh30P49Po6s", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["quacks, duck, several", "gun, shoot, water"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a duck is quacking loudly", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be shot", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "people speak as gunfire rings out"], "sample_ids": ["wqADXCzngMw", "wqTCwqVRDlk"], "start_seconds": ["340", "80"], "properties": ["engine, idle, man", "gunfire, ring, speak"], "captions_pred_video": ["of a man working on a vintage volkswagen 
beetle", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tOSWIURC-4", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["engine, work, nearby", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a lawn mower is running ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a duck quacks several times"], "sample_ids": ["zcDwZ6W7E3E", "vh30P49Po6s"], "start_seconds": ["180", "30"], "properties": ["man, speak, motorcycles", "quacks, duck, several"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is a single action", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["xyL9F5VrjkE", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["wind, motor, distance", "male, duck, laugh"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a child yells and another yells", "dishes cling 
together then a man begins to speak"], "sample_ids": ["vMDHu7Lxcgw", "sQGXqGcwOTc"], "start_seconds": ["410", "3"], "properties": ["two, yell, child", "cling, speak, dishes"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a woman speaks as she rubs two objects together"], "sample_ids": ["xzKKf9bKNUo", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["background, noise, snoring", "two objects, woman, speak"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", 
"an engine runs loudly"], "sample_ids": ["w5W5Kqtc8E", "vqZuVbG6-HI"], "start_seconds": ["100", "130"], "properties": ["wind, blow, vehicle", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a lawn mower is running and men are speaking "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a motorcycle engine works nearby", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["tOSWIURC-4", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["engine, work, nearby", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a lawn mower is running ", "a race car accelerates and revs its engine "], "question": "which entity has a vehicle passing by?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "metal clacking as food and oil sizzles followed by a woman talking"], "sample_ids": ["zcDwZ6W7E3E", "vW4x7S1VfQc"], "start_seconds": ["180", "150"], "properties": ["a, man, speak", "clacking, oil, woman"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a person cooking fish in a frying pan on a stove top"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "food sizzles in a frying pan"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sEprKHm8Sj8", "ukg5L09Wpvo"], "start_seconds": ["90", "150"], "properties": ["noise, loud, buzzing", "clickety-clack, train, whistle"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 
2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a train blows its whistle and blows its horn "], "question": "which noise is continuous", "label": 1}, {"captions": ["some men converse over an engine running", "someone snores nearby"], "sample_ids": ["sCiy7QS1U", "spJCm8tD9Zo"], "start_seconds": ["300", "90"], "properties": ["men, converse, engine", "someone snores, nearby, someone"], "captions_pred_video": [null, "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wsHBIgzs9Fs", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["horn, continuous, buzzing", "music, gunfire, explosion"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "water flows and trickles"], "sample_ids": ["v7jJS8aAyA", "tB7hWb9gTuQ"], "start_seconds": ["10", "30"], "properties": ["wind, blows, loudly", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "water is 
splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["tw76HGONaKg", "yeFvk9x0wWI"], "start_seconds": ["570", "30"], "properties": ["A, game, keyboard", "clack, bird, chirp"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "birds chirp in the background as a car drives by "], "question": "which entity is not a video game", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wfHeoPDLMaM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "stream, water, flow"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a 
barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage is blurry and out of focus"], "captions_pred_audio": ["ducks are quacking", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak and tapping occurs", "some men converse over an engine running"], "sample_ids": ["tFCUUGdREgA", "sCiy7QS1U"], "start_seconds": ["70", "300"], "properties": ["people, tap, speak", "men, converse, engine"], "captions_pred_video": ["a person riding a white horse in an indoor arena", null], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows people speaking and tapping occurs?", "label": 0}, {"captions": ["people speak and tapping occurs", "wind blowing followed by a zoom"], "sample_ids": ["tFCUUGdREgA", "vr8ZXjEBhMQ"], "start_seconds": ["70", "150"], "properties": ["people, tap, speak", "wind, blow, zoom"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to blow", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a man speaks as a motor runs in the background"], "sample_ids": ["zk-xJGQU8-4", 
"xZepNM9qcRA"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "background, motor, run"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a dark barks and whimpers"], "sample_ids": ["sWZzXuWYY", "sYj4hpDUZDQ"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "barks, whimpers, dark"], "captions_pred_video": [null, "a brown and white dog standing in front of a wall with its mouth open"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a dog barks and a cat meows"], "question": "which entity is quieter", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "water pouring and bubbling"], "sample_ids": ["vXlk0lIQBFo", "uyRfq-jKPpo"], "start_seconds": ["470", "50"], "properties": ["wind, talk, vocalize", "water, bubbles, pouring"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro 
puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a train horn sounds and railroad crossing ring", "paper is crumpling consistently"], "sample_ids": ["s7knHCFW82w", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["horn, sound, train", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a person screams glaringly"], "sample_ids": ["sNB8zxXneIM", "xC8kbrKJmco"], "start_seconds": ["20", "0"], "properties": ["several, quack, cocks", "glaringly, screams, person"], "captions_pred_video": ["a group of geese in a cage", null], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a goat is bleating "], "question": "which entity is a person?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "several insects fly while two men talk"], "sample_ids": ["yRx9txMcBl0", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["motors, tires, screech", "several, fly, men"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 
5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "small dogs yip and bark sharply"], "sample_ids": ["vW4x7S1VfQc", "v-wcQf4BDY0"], "start_seconds": ["150", "120"], "properties": ["clacking, oil, woman", "bark, yip, sharply"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["food sizzles in a frying pan", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["yYEVLuqEytU", "vKrYfzleLB8"], "start_seconds": ["40", "110"], "properties": ["animal, pig, background", "a, ring, gunshots"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity has more gunshots", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "birds chirp and objects are moved around"], "sample_ids": ["y1saVTXsKwc", "yPUYU6t3rwo"], "start_seconds": ["80", "370"], "properties": ["a, dog, talk", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a dog playing with a pink ball", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], 
"captions_pred_audio": ["a dog barks and a man speaks", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "some men converse over an engine running"], "sample_ids": ["vuUVPzd2FXw", "sCiy7QS1U"], "start_seconds": ["160", "300"], "properties": ["a, steam, release", "men, converse, engine"], "captions_pred_video": ["of the person cooking on the grill with a spatula", null], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man talking?", "label": 0}, {"captions": ["a jet engine roars, almost making a man inaudible", "a man speaks as a boat engine runs"], "sample_ids": ["xfaoyyzw2WU", "wtDqrBygTcU"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "man, engine, run"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "shows a person riding on the back of a boat as it speeds through the water"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking and a motor is running"], "question": "which engine is running", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "birds chirp and objects are moved around"], "sample_ids": ["y2bVZ7rz-5M", "yPUYU6t3rwo"], "start_seconds": ["280", "370"], "properties": ["motor noise, horn, siren", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "three men talk while wind blows and some liquid 
flows"], "sample_ids": ["rqu8iB22IY", "vJ7JPEFhyLA"], "start_seconds": ["5", "16"], "properties": ["sound, repeats, laugh", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man talking?", "label": 1}, {"captions": ["a person is burping while a girl speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vdoxuJn9lTc", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["person, burp, girl", "a woman, a television program, a bird"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a child speaks followed by a burp", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird nearby?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a vehicle engine accelerates and wind blows"], "sample_ids": ["uzQnlJXBbOM", "wudZTNBtVqc"], "start_seconds": ["50", "60"], "properties": ["ringing, beep, stop", "accelerates, engine, wind"], "captions_pred_video": ["footage of a person using a cell phone on a table", "footage is of a parking lot with cars parked in it"], "captions_pred_audio": ["a telephone rings and a man speaks", "a car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks as several small engines run", "a woman and man are speaking"], "sample_ids": ["u9A6VZQCZpU", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["a, man, talk", "two people, speaking, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "tapping occurs then a baby cries"], "sample_ids": ["weDbePuc-Xc", "wIJK3-5y0kA"], "start_seconds": ["40", "30"], "properties": ["cartoon character, music, vocalize", "a, cry, baby"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a baby cries and a woman speaks"], "question": "which entity is a baby?", "label": 1}, {"captions": ["a toilet flushes and water drains", "an infant crying as a woman laughs"], "sample_ids": ["sfAvvZwdLCY", "xhmRY9yhC7c"], "start_seconds": ["20", "20"], "properties": ["water drains, flushes, water", "a, laugh, infant"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a toilet is flushed", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["xvDdE3zNf8Y", "x9JovgqUcs"], "start_seconds": ["120", "500"], "properties": ["a, female, speaks", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": 
["water splashes and wind noise is made into a microphone", "repeated tapping is accompanied by water running and a woman speaking softly"], "sample_ids": ["sDSppXIlJrs", "wvKpEYswXO0"], "start_seconds": ["27", "150"], "properties": ["microphone, water, wind", "sound, water, running"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "of the person preparing food in the kitchen"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a woman is speaking and tapping with background noise and water running "], "question": "which entity has water running?", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["xSKJGCItUWE", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["engine, run, boy", "people, applaud, hoot"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a dark barks and whimpers", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sYj4hpDUZDQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "stream, water, flow"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "people cheer as a vehicle engine revs"], "sample_ids": ["wvKpEYswXO0", "xjhAnI2q6hM"], "start_seconds": ["150", "6"], "properties": ["water, tap, run", "engine revs, vehicle, people"], "captions_pred_video": ["of the person preparing 
food in the kitchen", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["birds chirp as a bell rings", "a duck quacks continuously"], "sample_ids": ["ziUT9IFTkjg", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["chirp, bell, ring", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 0}, {"captions": ["food is frying then a woman speaks", "birds chirp as a man speaks and a younger person speaks"], "sample_ids": ["ukxt9I7eMMg", "xl2PIWyXaM"], "start_seconds": ["30", "160"], "properties": ["food, woman, speak", "chirp, man, younger person"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "birds are chirping and people are talking"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["speaking following by laughing and clapping", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["u2f5NpsoHBg", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["person, laugh, clap", "a woman, laughs, animal"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a woman is speaking and a baby is crying"], "question": "which entity has a person speaking and laughing and clapping?", "label": 0}, {"captions": ["pigeons vocalize and birds chirp", "a clock 
ticktocks"], "sample_ids": ["uiS58TNyUiw", "v-g-j2uTByM"], "start_seconds": ["430", "30"], "properties": ["vocalize, bird, chirp", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of the pigeon in the cage", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["water bubbles and gurgles.", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tB7hWb9gTuQ", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["bubbles, gurgles, water", "female, spraying, scream"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["water is splashing and gurgling", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and spraying?", "label": 1}, {"captions": ["an airplane engine spools and people speak", "water pouring and bubbling"], "sample_ids": ["wTjoRj1se3U", "uyRfq-jKPpo"], "start_seconds": ["390", "50"], "properties": ["airplane, engine, spool", "water, bubbles, pouring"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a jet engine is running and people are talking", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["women speak and laugh as wind blows", "pigeons vocalize and birds chirp"], "sample_ids": ["un9VQlzgZM", "uiS58TNyUiw"], "start_seconds": ["5", "430"], "properties": ["wind, speak, laugh", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["y8dSeubCNI", "yajyRTUQk3U"], "start_seconds": ["4", "400"], "properties": ["men, women, car", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["continuous sneezing together with speech", "a stream of water runs briefly"], "sample_ids": ["x4dZyf9Gbj0", "x-PeY8Yb8M4"], "start_seconds": ["130", "300"], "properties": ["continuous, sneeze, speech", "stream, water, run"], "captions_pred_video": ["footage is blurry and out of focus", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman sneezes and speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a woman talks while something is fried 
and objects are tapped"], "sample_ids": ["yI-KvObbDoY", "yajyRTUQk3U"], "start_seconds": ["260", "400"], "properties": ["sound, smack, wind", "a woman, something, fried"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "- a woman cooking in the kitchen"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uEU-Hg5MTN8", "vb1fPSDI4c"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "multiple, people, yell"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tDVADusiIoc", "uZesmtKZGSw"], "start_seconds": ["60", "250"], "properties": ["wind, radio, waves", "men, talk, cars"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a 
man is speaking while the wind is blowing and water is splashing", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "a duck quacks loudly and continuously"], "sample_ids": ["w2JXXIAdUdg", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["emits, sleeping, person", "loud, continuous, quacks"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a duck quacks continuously"], "sample_ids": ["vcmWSmvti8", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["music, man, fire", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "an airplane engine runs"], "sample_ids": ["un9VQlzgZM", "yVPZ2MNWpms"], "start_seconds": ["5", "0"], "properties": ["females, talk, laugh", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a person speaks over rustling leaves"], "sample_ids": ["rwTERCUno", "zOZleIRqZm4"], "start_seconds": ["90", "80"], "properties": ["engine, idle, sputter", "rustling, 
leaves, person"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking with crickets chirping in the background"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a horn honks and then loudly blares", "winds blows roughly as a vehicle races past"], "sample_ids": ["wnpJndXuxLc", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["horn, honk, loud", "wind, blows, vehicle"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a dog whimpers and a woman briefly talks", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["y1saVTXsKwc", "vVhthZ45k3Y"], "start_seconds": ["80", "30"], "properties": ["a, dog, talk", "cat, purr, hiss"], "captions_pred_video": ["a dog playing with a pink ball", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking and a cat is meowing"], "question": "which animal is more active", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tEE3MpBt1sg", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["drill, something, laugh", "a woman, something, fried"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "- a woman cooking in the kitchen"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a muffled toilet 
flushes and the water drains", "a horn blasts as warning bells ring"], "sample_ids": ["sfAvvZwdLCY", "zgUgkpk78xU"], "start_seconds": ["20", "70"], "properties": ["flushes, drains, water", "horn, bells, ring"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a toilet is flushed", "a train blows its horn as it speeds down the tracks "], "question": "which entity is louder", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "wind blows as people chatter quietly"], "sample_ids": ["sQGXqGcwOTc", "xBxDz0CFVn0"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "wind, chatter, people"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "water flows followed by women screaming"], "sample_ids": ["vK93VuO0yNc", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["male voice, bus, rumble", "water, flow, women"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", null], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is followed by a soft male voice", "label": 0}, {"captions": ["a man woman speak while crickets sing", "a man speaks as a motor runs in the background"], 
"sample_ids": ["zTLVJCo4WEE", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "background, motor, run"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a duck quacks continuously"], "sample_ids": ["uqFtmnhuqA8", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "quacks, continuously, duck"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "wind blowing followed by a zoom"], "sample_ids": ["siJFXfGWgDk", "vr8ZXjEBhMQ"], "start_seconds": ["50", "150"], "properties": ["a, bird, vehicle", "wind, blow, zoom"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more like a natural phenomenon", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["y2bVZ7rz-5M", "wSVhSdj0F0"], "start_seconds": ["280", "10"], "properties": ["motor noise, horn, siren", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["footage of a parade of 
fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a car horn honks and keys jangle with background noise "], "question": "which entity has a horn honk and keys jingle?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sQGXqGcwOTc", "uYT5gxnyMWM"], "start_seconds": ["3", "50"], "properties": ["cling, speak, dishes", "female, spraying, scream"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "an infant crying as a woman laughs"], "sample_ids": ["xl2PIWyXaM", "xhmRY9yhC7c"], "start_seconds": ["160", "20"], "properties": ["chirp, man, younger person", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds are chirping and people are talking", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["w34HjHr6gAY", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["beeps, squawk, child speaking", "two men, woman, birds"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard 
of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "water splashes and a door squeaks"], "sample_ids": ["sOa7g-44Dag", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["audio, scratching, man", "sound, splash, door"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a dog barks and taps with background noise "], "question": "which entity has a door squeaking?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "water splashes as an animal walks through"], "sample_ids": ["tOSWIURC-4", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["engine, work, nearby", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a lawn mower is running ", "water splashes and gurgles as people speak"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a clock ticktocks briefly", "water pouring and bubbling"], "sample_ids": ["u7C-AEBQM", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["ticktocks, clock, ticktocks briefly", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a ticktock of a clock", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vYkA3cfXp5Q", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["engine, accelerate, idle", "airplane, boy, fly"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["an engine is idling", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a moving object", "label": 1}, {"captions": ["birds coo incessantly", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yZrFNS7GFBQ", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["coo, bird, incessant", "loud, multiple, distance"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["water flows and trickles", "waves crash against a shoreline and people speak"], "sample_ids": ["tB7hWb9gTuQ", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["water, flow, trickle", "wave, crash, shoreline"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "footage of a person 
surfing in the ocean"], "captions_pred_audio": ["water is splashing and gurgling", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sZPuqDgX2V0", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["engine, accelerate, intercom", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster crow?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["zofjfKhqLk8", "s7knHCFW82w"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "blow horn, get close, train"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a train is blowing its horn and its wheels are squealing "], "question": "which is a train", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tK4VlLsNxak", "w5W5Kqtc8E"], "start_seconds": ["120", "100"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "wind, blow, vehicle"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a motorboat is moving and people are 
shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "water rushes and then a vehicle zooms past"], "sample_ids": ["vqZuVbG6-HI", "s4Uz1Ffgo04"], "start_seconds": ["130", "100"], "properties": ["background, male, female", "water, rushes, vehicle"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is more active", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wSVhSdj0F0", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["horn honks, keys jingle, electronic beep", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "continuous sneezing together with speech"], "sample_ids": ["vZAw4apG0Es", "x4dZyf9Gbj0"], "start_seconds": ["30", "130"], "properties": ["people, clock, converse", "continuous, sneeze, speech"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman sneezes and speaks"], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["an aircraft engine runs", "music plays and someone speaks before 
gunfire and an explosion occurs"], "sample_ids": ["yLCORCnd35Q", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["engine, aircraft, runs", "music, gunfire, explosion"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a movie scene?", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "an insect buzzes around continuously"], "sample_ids": ["wTjoRj1se3U", "v25l1jef3JY"], "start_seconds": ["390", "0"], "properties": ["engine, run, people", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a jet engine is running and people are talking", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["wvKpEYswXO0", "tiDFTC-5vU"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "male, duck, laugh"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking as others laugh?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "small dogs yip and bark sharply"], "sample_ids": ["zk-xJGQU8-4", "v-wcQf4BDY0"], "start_seconds": ["130", "120"], "properties": ["food, man, woman", "bark, yip, sharply"], "captions_pred_video": ["a man and a woman cooking in a wok on 
the stove", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sWZzXuWYY", "tiDFTC-5vU"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking and ducks are quacking"], "question": "which entity is more humorous", "label": 1}, {"captions": ["a helicopter engine runs continuously", "small dogs yip and bark sharply"], "sample_ids": ["ugHJF0hfYkg", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["engine, running, continuously", "bark, yip, sharply"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a helicopter is flying overhead ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a dog barks and whimpers", "pigeons vocalize and birds chirp"], "sample_ids": ["sShpyu2l4YQ", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["barks, whimpers, dog", "vocalize, bird, chirp"], "captions_pred_video": ["the puppies are playing with a toy", "of the pigeon in the cage"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["birds coo incessantly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["yZrFNS7GFBQ", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["coo, bird, incessant", "loud, laughter, intermittent"], 
"captions_pred_video": ["of the bird in the cage", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["an owl hoots in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a speedboat passes quickly on the water", "someone snores nearby"], "sample_ids": ["tjmoSi330GM", "spJCm8tD9Zo"], "start_seconds": ["23", "90"], "properties": ["speed, water, boat", "someone snores, nearby, someone"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a person is snoring loudly"], "question": "which entity is moving", "label": 0}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["s4Uz1Ffgo04", "uZesmtKZGSw"], "start_seconds": ["100", "250"], "properties": ["roars, background, people speaking", "men, talk, cars"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, 
{"captions": ["people speak in the background as a clock ticktocks", "a car accelerates and wind blows"], "sample_ids": ["vZAw4apG0Es", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, clock, ticktocks", "accelerates, wind, blows"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a horse runs while two women talk", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sdvI1mHAsc", "wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["two women, horse, run", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people", "label": 1}, {"captions": ["a person is snoring while sleeping", "a cat meows as a young woman speaks"], "sample_ids": ["vJrjSeP17yE", "x5cuQjOdM3E"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "cat, meows, young woman"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a person snoring loudly", "a cat meows and a woman speaks"], "question": "which entity is a person", "label": 0}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["wSVhSdj0F0", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["horn honks, keys jingle, electronic beep", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], 
"captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a man woman speak while crickets sing", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zTLVJCo4WEE", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, crickets, sing", "three men, wind, flow"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks as several small engines run", "a horn rings out as a machine runs by"], "sample_ids": ["u9A6VZQCZpU", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["a, man, talk", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a speedboat passes quickly on the water"], "sample_ids": ["ukg5L09Wpvo", "tjmoSi330GM"], "start_seconds": ["150", "23"], "properties": ["clickety-clack, train, whistle", "speed, water, boat"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["a jet engine screams, then increases 
its power", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vBslzh7saPw", "tiDFTC-5vU"], "start_seconds": ["90", "30"], "properties": ["power, scream, increase", "male, duck, laugh"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "winds blows roughly as a vehicle races past"], "sample_ids": ["sapQIQUhFc", "xjvTpk2Zpr8"], "start_seconds": ["280", "70"], "properties": ["water, trickles, flow", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["y8WEcpOlT3I", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["harsh, wind, blows", "engine revs, vehicle, people"], "captions_pred_video": ["on how to use a sewing machine youtube", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tPJvjq9QePY", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["animal, bleat, moo", "loud, multiple, distance"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a person snoring 
loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["vdoxuJn9lTc", "y2bVZ7rz-5M"], "start_seconds": ["40", "280"], "properties": ["burp, loud, girl", "motor noise, horn, siren"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a child speaks followed by a burp", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["a duck quacks several times", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vh30P49Po6s", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["quacks, duck, several", "female, spraying, scream"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["paper is crumpling consistently", "a woman talking as an infant is crying"], "sample_ids": ["v5cSxLaHADY", "tMbMDvT50j8"], "start_seconds": ["0", "12"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "a, talk, infant"], "captions_pred_video": ["footage of the person holding a pair of scissors", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["paper is crumpled and crinkled", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["small dogs yip and bark sharply", "children speak and play together"], "sample_ids": ["v-wcQf4BDY0", "yVVP8XvWJTo"], "start_seconds": ["120", 
"260"], "properties": ["bark, yip, sharply", "children, speak, play"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a dog barks and growls", "children are speaking and breathing with background noise "], "question": "which entity is more social", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "an airplane engine runs"], "sample_ids": ["tEE3MpBt1sg", "yVPZ2MNWpms"], "start_seconds": ["50", "0"], "properties": ["drill, something, laugh", "engine, airplane, runs"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vhJWZheqaE", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["water drains unevenly, toilet flushes, water drains", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a toilet is flushed", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["people speak and tapping occurs", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tFCUUGdREgA", "tdWhHV3X25Q"], "start_seconds": ["70", "60"], "properties": ["people, tap, speak", "applause, audience, yells"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking 
and walking with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zkKdxzNC97Y", "sLUnaPT5gM8"], "start_seconds": ["27", "0"], "properties": ["hard, surface, door", "loud, laughter, intermittent"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a door is opened and closed", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wRBHTgrbiwg", "zFjIWfSD-4"], "start_seconds": ["50", "410"], "properties": ["bird, owl, speak", "People, motor, brakes"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man speaking briefly?", "label": 0}, {"captions": ["a loud snarling engine is followed by a man laughing", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zl9Dqx-j7q4", "wDVMhEdTiVw"], "start_seconds": ["6", "30"], "properties": ["engine, laugh, loud", "gun, shoot, water"], "captions_pred_video": ["footage of a man driving a car in the dark", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a jet engine roars ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is followed by water sloshing", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a young woman speaks over spraying and another person yells"], 
"sample_ids": ["uoGVs9yUqY4", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["multiple, vocalize, wind", "person, spray, yell"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "an infant crying as a woman laughs"], "sample_ids": ["xjhAnI2q6hM", "xhmRY9yhC7c"], "start_seconds": ["6", "20"], "properties": ["engine revs, vehicle, people", "a, laugh, infant"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water drains", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sfAvvZwdLCY", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["water drains, flushes, water", "music, gunfire, explosion"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a toilet is flushed", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a scene of a toilet flushing and water draining?", "label": 0}, {"captions": ["a kid speaks followed by music playing", "dishes cling together then a man begins to speak"], "sample_ids": ["tQWGZLItBXk", "sQGXqGcwOTc"], "start_seconds": ["170", "3"], "properties": ["music, kid, speak", "cling, speak, dishes"], "captions_pred_video": ["worms revolution screenshots", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": 
["a child speaks music plays video game sounds sound effects and sound effects play ", "mechanisms are operating and water is splashing "], "question": "which entity is about a kid speaking?", "label": 0}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tZGN5a7ybxo", "xfaoyyzw2WU"], "start_seconds": ["60", "180"], "properties": ["ring, train, horn", "loud, jet engine, roar"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a train is moving and blowing its horn ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["yFB25fqfU8I", "yDoT73BWsdA"], "start_seconds": ["300", "10"], "properties": ["wave, crash, shoreline", "engine, revs, vehicle"], "captions_pred_video": ["footage of a person surfing in the ocean", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "pigeons vocalize and birds chirp"], "sample_ids": ["wnpJndXuxLc", "uiS58TNyUiw"], "start_seconds": ["50", "430"], "properties": ["blows, vehicle, train", "vocalize, bird, chirp"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of the pigeon in the cage"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "rustling occurs, 
ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["y1saVTXsKwc", "vfYTJq7nU"], "start_seconds": ["80", "130"], "properties": ["a, dog, talk", "rustling, ducks, quack"], "captions_pred_video": ["a dog playing with a pink ball", null], "captions_pred_audio": ["a dog barks and a man speaks", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a duck quacks loudly and continuously"], "sample_ids": ["wAAkbZToh8", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man burps and a woman speaks", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": ["wP8ZKrlx3oA", "wnpJndXuxLc"], "start_seconds": ["40", "50"], "properties": ["fall, rain, splash", "beeps, loud, whistle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["tPJvjq9QePY", "yLy-WycbVVE"], "start_seconds": ["40", "30"], "properties": ["animal, bleat, moo", "background, people, talk"], "captions_pred_video": ["a dog and a sheep in a barn", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a baby cries 
and a man speaks", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "wind blows as people chatter quietly"], "sample_ids": ["vJ7JPEFhyLA", "xBxDz0CFVn0"], "start_seconds": ["16", "30"], "properties": ["three men, wind, flow", "wind, chatter, people"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yFB25fqfU8I", "uEU-Hg5MTN8"], "start_seconds": ["300", "27"], "properties": ["wave, crash, shoreline", "a woman, laughs, animal"], "captions_pred_video": ["footage of a person surfing in the ocean", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a stream of water runs briefly"], "sample_ids": ["x5cuQjOdM3E", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["cat, meows, young woman", "stream, water, run"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a cat meows and a woman speaks", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["an engine runs and wind blows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vs65y4qmyBE", "yDoT73BWsdA"], 
"start_seconds": ["340", "10"], "properties": ["engine, run, wind", "engine, revs, vehicle"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a child speaks in closed space"], "sample_ids": ["zl9Dqx-j7q4", "yW6FWLSLkx4"], "start_seconds": ["6", "40"], "properties": ["motors rev, laugh, loudly", "child, space, speak"], "captions_pred_video": ["footage of a man driving a car in the dark", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a jet engine roars ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wwyfGO2J4", "yajyRTUQk3U"], "start_seconds": ["90", "400"], "properties": ["people, applaud, hoot", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is about food?", "label": 1}, {"captions": ["a man talks while vehicles pass by", "some men converse over an engine running"], "sample_ids": ["sK4u5T8hW78", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["a, man, talk", "men, converse, engine"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man talking while vehicles pass by?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "paper is crumpling consistently"], "sample_ids": ["w5W5Kqtc8E", "v5cSxLaHADY"], "start_seconds": ["100", "0"], "properties": ["wind, engine, scream", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "paper is crumpled and crinkled"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yLy-WycbVVE", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "male, duck, laugh"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a man is speaking and ducks are quacking"], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a woman speaks as she rubs two objects together"], "sample_ids": ["slZLHwNbbt4", "vzxHnu-SFEw"], "start_seconds": ["300", "80"], "properties": ["clap, distance, horn", "two objects, woman, speak"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["xfudFO976zE", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["animal, bleats, cry", "animal, grunts, snorts"], "captions_pred_video": ["footage is blurry and shaky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a woman is speaking and a baby is crying"], "question": "which animal is grunting and snorting", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "someone is typing on a computer keyboard"], "sample_ids": ["xyL9F5VrjkE", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["wind, blows, vehicle", "keyboard, type, computer"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "how to make money on youtube 
in spanish"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a person is typing on a keyboard"], "question": "which is not a vehicle", "label": 1}, {"captions": ["people speak as gunfire rings out", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wqTCwqVRDlk", "xKB8O8LTs6s"], "start_seconds": ["80", "70"], "properties": ["gunfire, ring, speak", "music, gunfire, explosion"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a gun is fired", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a child speaks", "a toilet flushes and water drains"], "sample_ids": ["yW6FWLSLkx4", "sfAvvZwdLCY"], "start_seconds": ["40", "20"], "properties": ["a, child, speaks", "water drains, flushes, water"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "water flows as men speak and yell"], "sample_ids": ["s4Uz1Ffgo04", "vJ7JPEFhyLA"], "start_seconds": ["100", "16"], "properties": ["roars, background, people speaking", "water, flow, men"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor idles, accelerates, then slows 
down.", "a propeller moves loudly nearby"], "sample_ids": ["vYkA3cfXp5Q", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["speed, idle, accelerate", "loud, propeller, move"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["an engine is idling", "a helicopter is flying overhead "], "question": "which entity is moving at a constant speed", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wqADXCzngMw", "ukg5L09Wpvo"], "start_seconds": ["340", "150"], "properties": ["engine, idle, man", "clickety-clack, train, whistle"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a train blows its whistle and blows its horn "], "question": "which train is going to be moving?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a man speaks as a car is passing by"], "sample_ids": ["ugHJF0hfYkg", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "a, car, pass"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with background noise and breathing sounds "], 
"question": "which entity is stationary", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["v0wPrLBI3hg", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["vocalize, bird, speak", "a train, a horn, a bell"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a train blows its whistle and blows its horn "], "question": "which entity is a warning", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vs65y4qmyBE", "w5W5Kqtc8E"], "start_seconds": ["340", "100"], "properties": ["wind, blows, strongly", "wind, blow, vehicle"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blows?", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "a telephone rings followed by a woman talking"], "sample_ids": ["vZAw4apG0Es", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["people, clock, converse", "ring, talk, woman"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation between two people?", "label": 0}, {"captions": ["paper folding and crinkling", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zPpG3RD8lSs", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["paper, fold, crinkle", "music, 
gunfire, explosion"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a steam engine runs and whistles as it passes by"], "sample_ids": ["zY3icUyMdh8", "se87d6yxEOA"], "start_seconds": ["20", "10"], "properties": ["dog, bark, engine", "run, whistle, pass"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a train is moving and blowing its whistle "], "question": "which entity is a steam engine?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yJ0TePmaOo", "su6FAOcOA8c"], "start_seconds": ["390", "4"], "properties": ["two 
hard objects, man, speak", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a woman is speaking and a subway train is moving "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a small engine idles continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["y5WII6cTH7k", "yDoT73BWsdA"], "start_seconds": ["40", "10"], "properties": ["engine, idle, continuously", "engine, revs, vehicle"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a race car accelerates and revs its engine "], "question": "which engine is revving", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a man talks as several small engines run"], "sample_ids": ["uiS58TNyUiw", "u9A6VZQCZpU"], "start_seconds": ["430", "30"], "properties": ["vocalize, bird, chirp", "a, man, talk"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking while a race car is revving and accelerating "], "question": "which entity is talking", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a stream of water runs briefly"], "sample_ids": ["vddP56-ogds", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["water, splash, person, laugh", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "an insect 
buzzes around continuously"], "sample_ids": ["wwyfGO2J4", "v25l1jef3JY"], "start_seconds": ["90", "0"], "properties": ["people, applaud, hoot", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["people speak in a closed space", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sTpirNYo8vQ", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["people, space, speak", "animal, grunts, snorts"], "captions_pred_video": ["of a man taking a selfie on a bus", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["an airplane engine spools and people speak", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["wTjoRj1se3U", "tezvROoo4bs"], "start_seconds": ["390", "40"], "properties": ["airplane, engine, spool", "audio, throttle, speaking"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a busy city street with cars parked on both sides of the road"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car accelerates and revs while a man speaks "], "question": "which entity is about an airplane engine?", "label": 0}, {"captions": ["young female child snoring and breathing deeply", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sAam2NqGhLY", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["snoring, breathing, child", "rustling, ducks, quack"], 
"captions_pred_video": ["of a little girl sleeping on a couch", null], "captions_pred_audio": ["a person is snoring", "a duck quacks and a woman speaks"], "question": "which entity is a video of a child?", "label": 0}, {"captions": ["a man speaks then multiple motorcycles pass by", "a telephone rings followed by a woman talking"], "sample_ids": ["zcDwZ6W7E3E", "tGcFnX0GHI"], "start_seconds": ["180", "0"], "properties": ["a, man, speak", "ring, talk, woman"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a train horn blows as it passes by"], "sample_ids": ["zCrAfDfv6-A", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "horn, blows, train"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a person whistles a song", "a train whistle blows and a train passes by with a whistle blowing "], 
"question": "which entity is louder", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["wfHeoPDLMaM", "wyllXV6PjKo"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "a baby, a woman, a man"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a woman speaks and a baby cries"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a machine runs", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vD6lYD1l0BY", "uYT5gxnyMWM"], "start_seconds": ["330", "50"], "properties": ["a, machine, run", "a, scream, girl"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["water pouring and bubbling", "an audience gives applause as a man yells and a group sings"], "sample_ids": 
["uyRfq-jKPpo", "tdWhHV3X25Q"], "start_seconds": ["50", "60"], "properties": ["water, bubbles, pouring", "applause, audience, yells"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["water is running from a faucet", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a child speaks in closed space"], "sample_ids": ["se87d6yxEOA", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["run, whistle, pass", "child, space, speak"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["s6DESzUTGjY", "w5W5Kqtc8E"], "start_seconds": ["16", "100"], "properties": ["wind, laugh, woman", "wind, 
blow, vehicle"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blows before women yell?", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xyL9F5VrjkE", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["wind, motor, distance", "multiple, people, yell"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["children speak as a female ask them questions", "people cheer as a vehicle engine revs"], "sample_ids": ["wEBlkGWVWwE", "xjhAnI2q6hM"], "start_seconds": ["260", "6"], "properties": ["female, speak, questions", "engine revs, vehicle, people"], "captions_pred_video": ["shows a person writing on the whiteboard", "a school bus 
decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "winds blows roughly as a vehicle races past"], "sample_ids": ["sZvwOuuPGP0", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["engine, diesel, truck", "wind, blows, vehicle"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a medium engine is running ", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uYT5gxnyMWM", "w34HjHr6gAY"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "beeps, hit, woman"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "paper is crumpling consistently"], "sample_ids": ["wRV8yMk886E", 
"v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["liquid, spray, nozzle", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["two cars are parked in a parking lot at night", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man speaks followed by a loud burst", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a horn rings out as a machine runs by"], "sample_ids": ["u2f5NpsoHBg", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["person, laugh, clap", "a, horn, run"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["s3cTDAj31g", "vlS6YMeWAPo"], "start_seconds": ["80", "40"], "properties": ["man, talk, woman", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["zuua6-5goWw", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["birds, chirp, quiet, man, speaks", "two objects, woman, speak"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 
10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a beep repeats multiple times", "a horn honks twice and keys jingle, followed by a slam and an electronic beep"], "sample_ids": ["y682ml90jGw", "wSVhSdj0F0"], "start_seconds": ["11", "10"], "properties": ["beep, repeat, multiple", "horn honks, keys jingle, slam"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a car horn honks and keys jangle with background noise "], "question": "which entity has a slam?", 
"label": 1}, {"captions": ["a woman and man are speaking", "someone is typing on a computer keyboard"], "sample_ids": ["vbpKkWvfOu4", "v0x1odnXtP0"], "start_seconds": ["560", "210"], "properties": ["two people, speaking, woman, man", "keyboard, type, computer"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["some men converse over an engine running", "a clock ticktocks briefly"], "sample_ids": ["sCiy7QS1U", "u7C-AEBQM"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a ticktock of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vimzuGQvdcU", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, man, yells", "three men, wind, flow"], "captions_pred_video": ["a group of people are rafting down a river", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "a duck quacks continuously"], "sample_ids": ["tjmoSi330GM", "vh30P49Po6s"], "start_seconds": ["23", "30"], "properties": ["speed, water, boat", "quacks, 
continuously, duck"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xjvTpk2Zpr8", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["wind, blows, vehicle", "three men, wind, flow"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a vehicle racing past?", "label": 0}, {"captions": ["a person speaks briefly", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["zOZleIRqZm4", "wSVhSdj0F0"], "start_seconds": ["80", "10"], "properties": ["person, talk, brief", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a car horn honks and keys jangle with background noise "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "frogs croak and vocalize"], "sample_ids": ["uOpoD0gGXcs", "yswmmRZFItk"], "start_seconds": ["120", "0"], "properties": ["chirps, woman, bird", "croak, vocalize, frog"], "captions_pred_video": ["a herd of cows grazing in the field", "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a frog is croaking"], "question": "which animal is vocalizing", "label": 1}, {"captions": 
["wind blows in gusts as a woman speaks in the distance", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["uC9dtII1KDI", "sLUnaPT5gM8"], "start_seconds": ["150", "0"], "properties": ["wind, gusts, distance", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an engine revs and a turning noise is made", "a person is whistling a tune"], "sample_ids": ["tOSWIURC-4", "scYRUkrFLiQ"], "start_seconds": ["0", "30"], "properties": ["noise, engine, revs", "a, tune, whistle"], "captions_pred_video": [null, "of the man wearing a bow tie and a suit jacket in front of a red door"], "captions_pred_audio": ["a lawn mower is running ", "a person whistling a song"], "question": "which entity is not a noise", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "several insects fly while two men talk"], "sample_ids": ["xfaoyyzw2WU", "s-T9OVOiMLo"], "start_seconds": ["180", "330"], "properties": ["loud, jet engine, roar", "several, fly, men"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking while insects are buzzing in the background "], "question": "which is not a man", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a man speaks as a motor runs in the background"], "sample_ids": ["wtDqrBygTcU", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "background, motor, run"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds 
through the water", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a motor is running", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a boat?", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zkKdxzNC97Y", "wDVMhEdTiVw"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "gun, shoot, water"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a door is opened and closed", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity has a loud bang followed by a softer banging noise?", "label": 0}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["weDbePuc-Xc", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["cartoon character, music, vocalize", "engine, laugh, loud"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "an airplane engine runs"], "sample_ids": ["se87d6yxEOA", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["run, whistle, pass", "engine, airplane, runs"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a train is moving and 
blowing its whistle ", "a car is driving by on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a mechanical buzzing getting louder"], "sample_ids": ["y2ZBGpgbhHM", "sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["birds, tweet, pant", "noise, loud, buzzing"], "captions_pred_video": [null, "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["birds chirping and a dog panting", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["some people speak", "a goat bleats and someone makes a calling noise"], "sample_ids": ["vbZ-0lGPneg", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "noise, bleat, call"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a goat bleats and birds chirp"], "question": "which entity is a noise", "label": 1}, {"captions": ["a person speaks over rustling leaves", "loud clanking and banging with brief male speech"], "sample_ids": ["zOZleIRqZm4", "sWZzXuWYY"], "start_seconds": ["80", "420"], "properties": ["rustling, leaves, person", "male, speech, banging"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a sewing machine runs and a man speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a girl 
speaks followed by a scream and more girls talking"], "sample_ids": ["tQWGZLItBXk", "uYT5gxnyMWM"], "start_seconds": ["170", "50"], "properties": ["music, person, ding", "a, scream, girl"], "captions_pred_video": ["worms revolution screenshots", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "a car speeding up in the distance"], "sample_ids": ["wTideSjRFS0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["food, sizzle, woman", "distance, car, speed"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "an animal bleats and cries out and metal bangs"], "sample_ids": ["sYITalLZjj4", "xfudFO976zE"], "start_seconds": ["30", "0"], "properties": ["water, rushes, background, birds", "animal, bleats, cry"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage is blurry and shaky"], "captions_pred_audio": ["wind blows and birds chirp", "a goat bleats and birds chirp in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an airplane engine roars increasingly louder", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vBslzh7saPw", "tDVADusiIoc"], "start_seconds": ["90", "60"], "properties": ["engine, roar, louder", "water, radio, man"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a person riding on the back of a sailboat in rough seas"], 
"captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "wind blows as people chatter quietly"], "sample_ids": ["zY3icUyMdh8", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "wind, chatter, people"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a person is whistling", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sIXTftIuUgw", "zFjIWfSD-4"], "start_seconds": ["90", "410"], "properties": ["person, whistling, person", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "a drill drills through something then people begin laughing"], "sample_ids": ["yDoT73BWsdA", "tEE3MpBt1sg"], "start_seconds": ["10", "50"], "properties": ["engine revs, tires squeal, vehicle", "drill, something, laugh"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult 
male speaking and duck calls being blown"], "sample_ids": ["zgUgkpk78xU", "vfYTJq7nU"], "start_seconds": ["70", "130"], "properties": ["horn, bell, train", "rustling, ducks, quack"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a duck quacks and a woman speaks"], "question": "which entity is about a train?", "label": 0}, {"captions": ["a loud snarling engine is followed by a man laughing", "a person sneezes followed by another person speaking"], "sample_ids": ["zl9Dqx-j7q4", "t8CV69hcvF0"], "start_seconds": ["6", "210"], "properties": ["engine, laugh, loud", "person, sneeze, follow"], "captions_pred_video": ["footage of a man driving a car in the dark", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a jet engine roars ", "a woman sneezes and speaks"], "question": "which entity is followed by a person speaking", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a man speaks as a car is passing by"], "sample_ids": ["yRx9txMcBl0", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "a, car, pass"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["heavy rain splashes as it falls", "water flows as men speak and yell"], "sample_ids": ["wP8ZKrlx3oA", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["fall, rain, splash", "water, flow, men"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["rqfQRErjfk8", "ukg5L09Wpvo"], "start_seconds": ["170", "150"], "properties": ["crowd, cheers, applauds", "clickety-clack, train, whistle"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a young woman speaks and laughs 
and an animal snorts"], "sample_ids": ["wyllXV6PjKo", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["a baby, a woman, a man", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a baby is crying"], "question": "which entity has a baby?", "label": 0}, {"captions": ["an airplane flies overhead as a woman speaks", "a toilet flushes and a female speaks"], "sample_ids": ["zj2R0XoFr5k", "yaln9y8I7ms"], "start_seconds": ["50", "230"], "properties": ["airplane, fly, overhead", "female, flushes, toilet"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a toilet flushes and a man speaks"], "question": "which entity is a woman speaking?", "label": 0}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a man speaks as a motor runs in the background"], "sample_ids": ["weDbePuc-Xc", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["music, slaps, human", "background, motor, run"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as a machine runs", "an infant crying frantically"], "sample_ids": ["vD6lYD1l0BY", "zwOBqeFTgiU"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "cry, infant, frantically"], "captions_pred_video": ["game controller being held in the hands of the person", "of 
the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["multiple ducks quack continuously", "paper is crumpling consistently"], "sample_ids": ["wfHeoPDLMaM", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["multiple, quack, continuously", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["ducks are quacking", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a infant makes noise and is excited"], "sample_ids": ["zl9Dqx-j7q4", "wIJK3-5y0kA"], "start_seconds": ["6", "30"], "properties": ["engine, laugh, loud", "noise, excited, infant"], "captions_pred_video": ["footage of a man driving a car in the dark", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a jet engine roars ", "a baby cries and a woman speaks"], "question": "which entity is making 
noise", "label": 1}, {"captions": ["a car accelerates and wind blows", "an engine sputters followed by a car zooming by"], "sample_ids": ["u0TrcHhkPQ", "u5RmF3c3Aw"], "start_seconds": ["20", "60"], "properties": ["accelerates, wind, blows", "engine, car, zoom"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a race car accelerates and skids with wind noise in the background "], "question": "which car is zooming by", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uWAAAL4CIoc", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["a, dog, vocalize", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["rwTERCUno", "xfaoyyzw2WU"], "start_seconds": ["90", "180"], "properties": ["engine, idle, sputter", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an engine is idling and vibrating", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "water rushes and then a vehicle zooms past"], "sample_ids": ["v0x1odnXtP0", "s4Uz1Ffgo04"], "start_seconds": ["210", "100"], "properties": ["keyboard, type, computer", "water, rushes, vehicle"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking 
while a boat is moving and wind is blowing "], "question": "which entity is more likely to be seen in a movie", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a woman speaks and is crumpling paper"], "sample_ids": ["zgUgkpk78xU", "xvDdE3zNf8Y"], "start_seconds": ["70", "120"], "properties": ["clinking, humming, horn", "A, crumple, paper"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman speaks and crumples paper"], "question": "which entity is crumpling paper", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a duck quacks continuously"], "sample_ids": ["tOSWIURC-4", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["engine, work, nearby", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a lawn mower is running ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "a stream of water runs briefly"], "sample_ids": ["xO-Q2BlIIPU", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["two men, exclamation, speak", "stream, water, run"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is driving on a wet road "], "question": "which 
entity is a stream of water?", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uEU-Hg5MTN8", "vb1fPSDI4c"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "multiple, people, yell"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vJ7JPEFhyLA", "zl9Dqx-j7q4"], "start_seconds": ["16", "6"], "properties": ["three men, wind, flow", "engine, laugh, loud"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "birds chirp as a man speaks and a younger person speaks"], "sample_ids": ["w2M4i1mklOA", "xl2PIWyXaM"], "start_seconds": ["30", "160"], "properties": ["loud, chime, bell", "chirp, man, younger person"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "birds are chirping and people are talking"], "question": "which entity is quieter", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xERFUeZONz8", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["ring, approach, traffic", "wind, blow, vehicle"], "captions_pred_video": ["footage is blurry due to camera shake or motion 
blur", null], "captions_pred_audio": ["an emergency vehicle siren blares", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "small dogs yip and bark sharply"], "sample_ids": ["wztCSUxOf8", "v-wcQf4BDY0"], "start_seconds": ["130", "120"], "properties": ["a crowd, yells, applauds", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a dog barks and growls"], "question": "which entity is more likely to be a group of people", "label": 0}, {"captions": ["a sleeping person snores and wheezes", "a woman speaks as she rubs two objects together"], "sample_ids": ["spJCm8tD9Zo", "vzxHnu-SFEw"], "start_seconds": ["90", "80"], "properties": ["snores, wheezes, sleeps", "two objects, woman, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring loudly", "a woman is 
speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["continuous sneezing together with speech", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["x4dZyf9Gbj0", "t25U-v4k4ts"], "start_seconds": ["130", "40"], "properties": ["continuous, sneeze, speech", "a, chirps, bird"], "captions_pred_video": ["footage is blurry and out of focus", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and bees are buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["w6RTHR6AeAg", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["call, owl, screech", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a duck quacks and a woman speaks"], "question": "which entity is a bird?", "label": 0}, {"captions": ["birds chirp, a woman speaks, and insects buzz", "a man speaks as a motor runs in the background"], "sample_ids": ["t97k0cejSQE", "xZepNM9qcRA"], "start_seconds": ["250", "30"], "properties": ["sound, chirp, buzz", "background, motor, run"], "captions_pred_video": ["a bee on a purple thistle flower", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wTideSjRFS0", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["food, sizzle, woman", "loud, multiple, 
distance"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tIY7qOV3rEM", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "two men, woman, birds"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more animals", "label": 0}, {"captions": ["a horn blasts as warning bells ring", "a child speaks in closed space"], "sample_ids": ["zgUgkpk78xU", "yW6FWLSLkx4"], "start_seconds": ["70", "40"], "properties": ["horn, bells, ring", "child, space, speak"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an engine runs and wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["vs65y4qmyBE", "viuTg1M-dqg"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "two men, speak, follow"], "captions_pred_video": ["a 
car is engulfed in flames on the side of the road", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a baby cries and a woman speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tMbMDvT50j8", "vfYTJq7nU"], "start_seconds": ["12", "130"], "properties": ["a, cry, woman", "rustling, ducks, quack"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a baby?", "label": 0}, {"captions": ["a woman and man speak while food is frying", "a vehicle engine revs and tires squeal"], "sample_ids": ["zk-xJGQU8-4", "yDoT73BWsdA"], "start_seconds": ["130", "10"], "properties": ["food, man, woman", "engine revs, tires squeal, vehicle"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a church bell rings several times", "several insects fly while two men talk"], "sample_ids": ["sUVVjE3Ucp8", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["ring, bell, several", "several, fly, men"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a church bell is 
ringing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a large engine roars and a strong wind blows", "a sputtering motor idles roughly"], "sample_ids": ["snFy48Lv3r8", "rwTERCUno"], "start_seconds": ["30", "90"], "properties": ["engine, roar, wind", "a, idle, motor"], "captions_pred_video": [null, null], "captions_pred_audio": ["the wind is blowing and water is splashing", "an engine is idling and vibrating"], "question": "which entity is quieter", "label": 1}, {"captions": ["a stream of water flows quickly", "a woman speaks as she rubs two objects together"], "sample_ids": ["wbHTKEJZyhc", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["stream, water, flow", "two objects, woman, speak"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a physical action", "label": 1}, {"captions": ["a child speaks", "small dogs yip and bark sharply"], "sample_ids": ["yW6FWLSLkx4", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["a, child, speaks", "bark, yip, sharply"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks followed by another man speaking outside", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["viuTg1M-dqg", "wRBHTgrbiwg"], "start_seconds": ["30", "50"], "properties": ["two men, speak, follow", "bird, owl, speak"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "of a bee 
pollinating the flowers in the field"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "birds are chirping and insects are buzzing"], "question": "which entity has more animals speaking", "label": 1}, {"captions": ["a person sniffs and sneezes", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uRlbY6aoBU", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["sneezes, person, sniffs", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["uWPRNLnpy7Y", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["accelerate, laugh, vehicle", "animal, grunts, snorts"], "captions_pred_video": ["is taken from a car driving down the street", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vzceMbklWc", "w5W5Kqtc8E"], "start_seconds": ["180", "100"], "properties": ["water, faucet, sink", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an emergency siren wails as it passes", "some men converse over an engine running"], "sample_ids": ["vGj1XLJvNrw", "sCiy7QS1U"], "start_seconds": ["0", "300"], 
"properties": ["wails, wails, pass", "men, converse, engine"], "captions_pred_video": ["footage of a police car driving down a city street", null], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["y8WEcpOlT3I", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["wind, speak, buffeting", "a woman, something, fried"], "captions_pred_video": ["on how to use a sewing machine youtube", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tQWGZLItBXk", "zl9Dqx-j7q4"], "start_seconds": ["170", "6"], "properties": ["music, kid, speak", "engine, laugh, loud"], "captions_pred_video": ["worms revolution screenshots", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a woman speaks as she rubs two objects together"], "sample_ids": ["ylpYOorfH4o", "vzxHnu-SFEw"], "start_seconds": ["410", "80"], "properties": ["motor, run, steady", "two objects, woman, speak"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 
1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is being rubbed together?", "label": 1}, {"captions": ["a person speaks over rustling leaves", "water splashes as an animal walks through"], "sample_ids": ["zOZleIRqZm4", "w1ir-sZ3Im8"], "start_seconds": ["80", "90"], "properties": ["rustling, leaves, person", "animal, water, splashes"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "water splashes and gurgles as people speak"], 
"question": "which entity is more active", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zY3icUyMdh8", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "stream, water, flow"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "an infant crying as a woman laughs"], "sample_ids": ["x6ijhqRY38s", "xhmRY9yhC7c"], "start_seconds": ["250", "20"], "properties": ["something metal, glass, hit", "a, laugh, infant"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman sneezes then speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["x4dZyf9Gbj0", "zl9Dqx-j7q4"], "start_seconds": ["130", "6"], "properties": ["sneezes, speaks, woman", "engine, laugh, loud"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman sneezes and speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "an infant crying as a woman laughs"], "sample_ids": ["vVhthZ45k3Y", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["cat, purr, hiss", "a, laugh, infant"], "captions_pred_video": ["footage is blurry and 
out of focus", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["an audience gives applause", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["x6iCUDmRpKQ", "tiDFTC-5vU"], "start_seconds": ["38", "30"], "properties": ["applause, audience, give", "male, duck, laugh"], "captions_pred_video": ["a black background with the moon and stars in the sky", null], "captions_pred_audio": ["a group of people are clapping and cheering", "a man is speaking and ducks are quacking"], "question": "which is not a person", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "a train horn blows as it passes by"], "sample_ids": ["tOSWIURC-4", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["noise, engine, revs", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a lawn mower is running ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which train is making noise", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a kid speaks followed by 
music playing"], "sample_ids": ["rwtmaKiCcQU", "tQWGZLItBXk"], "start_seconds": ["30", "170"], "properties": ["nozzle, depressed, spray can", "music, kid, speak"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "worms revolution screenshots"], "captions_pred_audio": ["spraying and people speaking", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a nozzle depressed", "label": 0}, {"captions": ["a car accelerates and wind blows", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["u0TrcHhkPQ", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["accelerates, wind, blows", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xl2PIWyXaM", "wDVMhEdTiVw"], "start_seconds": ["160", "30"], "properties": ["chirp, man, younger person", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["birds are chirping and people are talking", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wTjoRj1se3U", "xfaoyyzw2WU"], "start_seconds": ["390", "180"], "properties": ["engine, run, people", "loud, jet engine, roar"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of an airplane on the tarmac at an airport"], 
"captions_pred_audio": ["a jet engine is running and people are talking", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "water flows as men speak and yell"], "sample_ids": ["uZesmtKZGSw", "vJ7JPEFhyLA"], "start_seconds": ["250", "16"], "properties": ["men, talk, cars", "water, flow, men"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["bees buzz and wind blows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tMJne1a4AFI", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["bees buzz, wind blows, bees", "loud, jet engine, roar"], "captions_pred_video": ["a swarm of bees on the ground", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a swarm of bees buzzing around", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "three men talk while wind blows and some liquid flows"], "sample_ids": ["tDVADusiIoc", "vJ7JPEFhyLA"], "start_seconds": ["60", "16"], 
"properties": ["water, radio, man", "three men, wind, flow"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["wRBHTgrbiwg", "vlS6YMeWAPo"], "start_seconds": ["50", "40"], "properties": ["birds, chirp, cooing", "sheep, baa, birds"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a man speaks as a motor runs in the background"], "sample_ids": ["vW4x7S1VfQc", "xZepNM9qcRA"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "background, motor, run"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["food sizzles in a frying pan", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["zgUgkpk78xU", "x9JovgqUcs"], "start_seconds": ["70", "500"], "properties": ["horn, bell, train", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["of a train passing through a small 
town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man speaks and types on a keyboard"], "question": "which entity is a person", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vms5XGTDVQc", "w5W5Kqtc8E"], "start_seconds": ["220", "100"], "properties": ["paper, crumpled, crinkled", "wind, blow, vehicle"], "captions_pred_video": ["footage of a woman opening a black bag on a table", null], "captions_pred_audio": ["paper is crumpled and crinkled", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "sirens ring and approach with humming of distant traffic"], "sample_ids": ["w5W5Kqtc8E", "xERFUeZONz8"], "start_seconds": ["100", "0"], "properties": ["wind, engine, scream", "ring, approach, traffic"], "captions_pred_video": [null, "footage is blurry due to camera shake or motion blur"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "an emergency vehicle siren blares"], "question": "which entity is a warning", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "heavy rain splashes as it falls"], "sample_ids": ["u21-Z5gJCB8", "wP8ZKrlx3oA"], "start_seconds": ["30", "40"], "properties": ["background, voice, man", "fall, rain, splash"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a man is speaking with 
background noise and breathing sounds ", "a heavy rain is falling on a surface"], "question": "which entity is more likely to cause water to splash", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w2M4i1mklOA", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "rooster, crow, background, men"], "captions_pred_video": ["footage of an antique clock", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a clock?", "label": 0}, {"captions": ["a child babbles as a woman speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wEBlkGWVWwE", "zFjIWfSD-4"], "start_seconds": ["260", "410"], "properties": ["a, babble, woman", "People, motor, brakes"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a duck quacks continuously"], "sample_ids": ["rwtmaKiCcQU", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["nozzle, depressed, spray can", "quacks, continuously, duck"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["spraying and people speaking", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a car accelerates and wind blows"], "sample_ids": ["zk-xJGQU8-4", "u0TrcHhkPQ"], "start_seconds": 
["130", "20"], "properties": ["food, man, woman", "accelerates, wind, blows"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["tQWGZLItBXk", "sLUnaPT5gM8"], "start_seconds": ["170", "0"], "properties": ["voice, music, whoosh", "loud, laughter, intermittent"], "captions_pred_video": ["worms revolution screenshots", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water rushes and then a vehicle zooms past", "a woman speaks as she rubs two objects together"], "sample_ids": ["s4Uz1Ffgo04", "vzxHnu-SFEw"], "start_seconds": ["100", "80"], "properties": ["water, rushes, vehicle", "two objects, woman, speak"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a vehicle zooming past?", "label": 0}, {"captions": ["vehicle engines race around a track as a man commentates", "waves crash against a shoreline and people speak"], "sample_ids": ["sZPuqDgX2V0", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["commentator, race, track", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be seen in a movie", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tqR406bGiE", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["flush, water, gurgle", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about water?", "label": 0}, {"captions": ["male speech with light ticking", "water flows as men speak and yell"], "sample_ids": ["xO-Q2BlIIPU", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["male, speech, ticking", "water, flow, men"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with 
background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a duck quacks continuously"], "sample_ids": ["ukg5L09Wpvo", "vh30P49Po6s"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "quacks, continuously, duck"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a machine beeps continuously"], "sample_ids": ["sWZzXuWYY", "y682ml90jGw"], "start_seconds": ["420", "11"], "properties": ["male, speech, banging", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vfYTJq7nU", "w5W5Kqtc8E"], "start_seconds": ["130", "100"], "properties": ["rustling, ducks, quack", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a duck quacks and a woman speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sfAvvZwdLCY", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "stream, water, 
flow"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with wind noise in the background "], "question": "which entity is moving water", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vlJS7LN2XyM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["background, clocks, ticking", "loud, multiple, distance"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person is whistling", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sIXTftIuUgw", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["person, whistling, person", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman sneezes then speaks", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["x4dZyf9Gbj0", "y2ZBGpgbhHM"], "start_seconds": ["130", "30"], "properties": ["sneezes, speaks, woman", "birds, tweet, pant"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "birds chirping and a dog panting"], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a dark barks and whimpers"], "sample_ids": ["xKB8O8LTs6s", "sYj4hpDUZDQ"], "start_seconds": 
["70", "30"], "properties": ["music, gunfire, explosion", "barks, whimpers, dark"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a brown and white dog standing in front of a wall with its mouth open"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a dog barks and a cat meows"], "question": "which entity is more quiet", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sZPuqDgX2V0", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["commentator, race, track", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a car speeding up in the distance"], "sample_ids": ["spJCm8tD9Zo", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["snores, wheezes, sleeps", "distance, car, speed"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sofxkNWaP0s", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["wind, engine, louder", "engine, idle, woman"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a woman is speaking and a subway train is moving 
"], "question": "which entity is stationary", "label": 1}, {"captions": ["a toilet flushes and water drains", "water splashes as an animal walks through"], "sample_ids": ["sfAvvZwdLCY", "w1ir-sZ3Im8"], "start_seconds": ["20", "90"], "properties": ["water drains, flushes, water", "animal, water, splashes"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a toilet is flushed", "water splashes and gurgles as people speak"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "some tunes played by whistling"], "sample_ids": ["uWPRNLnpy7Y", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["accelerate, laugh, vehicle", "tune, play, whistling"], "captions_pred_video": ["is taken from a car driving down the street", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "wind blows as people chatter quietly"], "sample_ids": ["xSKJGCItUWE", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", "wind, chatter, people"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage is blurry and out of focus"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp and wind blows", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["sxIvBMSavMQ", "x6ijhqRY38s"], "start_seconds": ["210", "250"], "properties": ["birds, chirp, wind", "something metal, glass, hit"], "captions_pred_video": ["beekeeping 101 how to extract 
honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and dishes are clanging "], "question": "which entity is about something hitting something?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a man speaks over intermittent keyboard taps"], "sample_ids": ["wSVhSdj0F0", "tw76HGONaKg"], "start_seconds": ["10", "570"], "properties": ["horn honks, keys jingle, slam", "audio, man, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the 
legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man speaks and types on a computer keyboard "], "question": "which entity is a recording of a person speaking?", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a train horn blows as it passes by"], "sample_ids": ["sZvwOuuPGP0", "zVacuqSb4LI"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "horn, blows, train"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a medium engine is running ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is not a vehicle", "label": 1}, {"captions": ["water running down a sink while a man is talking", "wind blows strongly and a young man speaks"], "sample_ids": ["vSeGhaZt-aI", "vs65y4qmyBE"], "start_seconds": ["50", "340"], "properties": ["water, sink, talk", "wind, blows, strongly"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a car is engulfed in flames on the side of the road"], 
"captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a heavy engine is running and men are speaking "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "people applaud and hoot and chat quietly"], "sample_ids": ["ylpYOorfH4o", "wwyfGO2J4"], "start_seconds": ["410", "90"], "properties": ["engine, run, loud", "people, applaud, hoot"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sWZzXuWYY", "w5W5Kqtc8E"], "start_seconds": ["420", "100"], "properties": ["male, clanks, thumps", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "water is sprayed across a hard surface"], "sample_ids": ["uEU-Hg5MTN8", "sQwlkXjQabo"], "start_seconds": ["27", "10"], 
"properties": ["animal, grunts, snorts", "water, spray, surface"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks while water drains", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["vSeGhaZt-aI", "vqZuVbG6-HI"], "start_seconds": ["50", "130"], "properties": ["water, drain, man", "background, male, female"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a lawn mower is running and men are speaking "], "question": "which entity has more background noise", "label": 1}, {"captions": ["someone snores nearby", "a car speeding up in the distance"], "sample_ids": ["spJCm8tD9Zo", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "distance, car, speed"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a beep occurs briefly"], "sample_ids": ["sNB8zxXneIM", "xtWeJ56-U-g"], "start_seconds": ["20", "20"], "properties": ["several, quack, cocks", "beep, occur, briefly"], "captions_pred_video": ["a group of geese in a cage", "how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 
79 80 81 82 8"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "mechanisms are ticking and a beep is heard "], "question": "which entity is silent", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "some tunes played by whistling"], "sample_ids": ["xSKJGCItUWE", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["engine, work, child", "tune, play, whistling"], "captions_pred_video": ["footage of the helicopter flying in the room", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["birds coo incessantly", "people applaud and hoot and chat quietly"], "sample_ids": ["yZrFNS7GFBQ", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["coo, bird, incessant", "people, applaud, hoot"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone is burping continuously", "winds blows roughly as a vehicle races past"], "sample_ids": ["y636gklDioE", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["burps, burps, burps", "wind, blows, vehicle"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a person burps loudly several times", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["someone snores nearby", "a woman speaks with water running"], "sample_ids": ["spJCm8tD9Zo", "wTideSjRFS0"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "water, running, woman"], "captions_pred_video": ["of a 
man laying on the ground with his mouth open", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking while water is running in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["s59PfAghdkM", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["bird, chirp, background, horse, neigh", "water, radio, man"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a horse in it?", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["vs65y4qmyBE", "xZepNM9qcRA"], "start_seconds": ["340", "30"], "properties": ["wind, blows, strongly", "background, motor, run"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["food is frying and sizzles", "vehicles pass by on a roadway"], "sample_ids": ["zNRChLjqcU", "tgbONvsP47Y"], "start_seconds": ["220", "0"], "properties": ["food is frying, sizzles, food", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["water is running from a faucet into a sink", "a 
car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaking with light rustling", "a vehicle engine runs and someone speaks"], "sample_ids": ["zOZleIRqZm4", "zF8yoL0rkbI"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "engine, run, someone"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the traffic on the street at night"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "the wind is blowing hard and water is splashing"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a clang followed by a toilet flushing", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wNZ5thZM7XU", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["sound, flush, toilet", "applause, audience, yells"], "captions_pred_video": ["footage of a toilet in a bathroom stall", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a toilet flushes", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a woman talks while a baby cries and a man whispers"], "sample_ids": ["vdoxuJn9lTc", "smDKStoHBJo"], "start_seconds": ["40", "0"], "properties": ["burp, loud, girl", "a, talk, baby, cry"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a man holding a crying baby in his arms"], "captions_pred_audio": ["a child speaks followed by a burp", "a baby is crying and a woman is speaking"], "question": "which entity has a baby?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "an airplane flies overhead as a woman speaks"], "sample_ids": ["slZLHwNbbt4", "zj2R0XoFr5k"], "start_seconds": ["300", "50"], "properties": ["train, horn, sound", 
"airplane, fly, overhead"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman speaks while a helicopter flies overhead "], "question": "which object is flying overhead", "label": 0}, {"captions": ["the revving of an engine throttle followed by a man speaking", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["tezvROoo4bs", "rqu8iB22IY"], "start_seconds": ["40", "5"], "properties": ["audio, throttle, speaking", "sound, repeats, laugh"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", null], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a dog barks and a man speaks while music plays "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["several ducks are quacking and squawking", "a child speaks in closed space"], "sample_ids": ["wfHeoPDLMaM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["quacking, squawking, ducks", "child, space, speak"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the 
barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["ducks are quacking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "pigeons vocalize and birds chirp"], "sample_ids": ["wTjoRj1se3U", "uiS58TNyUiw"], "start_seconds": ["390", "430"], "properties": ["engine, run, people", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "of the pigeon in the cage"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "people speak as gunfire rings out"], "sample_ids": ["rwTERCUno", "wqTCwqVRDlk"], "start_seconds": ["90", "80"], "properties": ["engine, idle, sputter", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["a dog barks and whimpers", "someone whistles a tune"], "sample_ids": ["sShpyu2l4YQ", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["barks, whimpers, dog", "someone, tune, whistle"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks as a machine runs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vD6lYD1l0BY", "zl9Dqx-j7q4"], "start_seconds": ["330", "6"], "properties": ["a, 
machine, run", "engine, laugh, loud"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "an infant crying as a woman laughs"], "sample_ids": ["x9JovgqUcs", "xhmRY9yhC7c"], "start_seconds": ["500", "20"], "properties": ["a, man, speaks, keyboard", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vbZ-0lGPneg", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["a woman, a television program, a bird", "multiple, people, yell"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "people applaud and hoot and chat quietly"], "sample_ids": ["vZAw4apG0Es", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["people, clock, converse", "people, applaud, hoot"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["an infant crying as a woman laughs", "people converse as a motor runs and air brakes hiss"], 
"sample_ids": ["xhmRY9yhC7c", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["a, laugh, infant", "People, motor, brakes"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["zj2R0XoFr5k", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["airplane, boy, fly", "cling, speak, dishes"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "mechanisms are operating and water is splashing "], "question": "which entity is about a boy speaking?", "label": 0}, {"captions": ["a man sprays as a scraping occurs in the background", "water flows and trickles"], "sample_ids": ["sOa7g-44Dag", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["background, man, spray", "water, flow, trickle"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an aircraft engine runs", "a man speaks as a motor runs in the background"], "sample_ids": ["yLCORCnd35Q", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["engine, aircraft, runs", "background, motor, run"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "a close-up view 
of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a motor?", "label": 1}, {"captions": ["a man speaks while water drains", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vSeGhaZt-aI", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["water, drain, man", "music, gunfire, explosion"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wvKpEYswXO0", "xBxDz0CFVn0"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "stream, water, flow"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking with wind noise in the background "], "question": "which entity has water flowing", "label": 1}, {"captions": ["a clock ticktocks briefly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["u7C-AEBQM", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["ticktocks, clock, ticktocks briefly", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a ticktock of a clock", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", 
"label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["u5RmF3c3Aw", "yajyRTUQk3U"], "start_seconds": ["60", "400"], "properties": ["engine, car, zoom", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about a car zooming by?", "label": 0}, {"captions": ["heavy rain splashes as it falls", "a car accelerates and wind blows"], "sample_ids": ["wP8ZKrlx3oA", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["fall, rain, splash", "accelerates, wind, blows"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "someone is typing on a computer keyboard"], "sample_ids": ["vW4x7S1VfQc", "v0x1odnXtP0"], "start_seconds": ["150", "210"], "properties": ["clacking, oil, woman", "keyboard, type, computer"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "how to make money on youtube in spanish"], "captions_pred_audio": ["food sizzles in a frying pan", "a person is typing on a keyboard"], "question": "which is a type of computer", "label": 1}, {"captions": ["people speak in a closed space", "a child babbles as a woman speaks"], "sample_ids": ["sTpirNYo8vQ", "wEBlkGWVWwE"], "start_seconds": ["30", "260"], "properties": ["people, space, speak", "a, babble, woman"], "captions_pred_video": ["of a man taking a selfie on a bus", "shows a person writing on the whiteboard"], 
"captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and a child is speaking with background noise and clapping "], "question": "which entity is a child babbles as a woman speaks?", "label": 1}, {"captions": ["women speak and laugh as wind blows", "vehicles pass by on a roadway"], "sample_ids": ["un9VQlzgZM", "tgbONvsP47Y"], "start_seconds": ["5", "0"], "properties": ["wind, speak, laugh", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "water rushes and then a vehicle zooms past"], "sample_ids": ["vJvryTwuAV8", "s4Uz1Ffgo04"], "start_seconds": ["16", "100"], "properties": ["audience, cheer, man", "water, rushes, vehicle"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is more active", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "some tunes played by whistling"], "sample_ids": ["zofjfKhqLk8", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["background, metal, clings", "tune, play, whistling"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a train whistle 
keeps going off while the clickety-clack of the train on the rails are continuous", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["ukg5L09Wpvo", "xKB8O8LTs6s"], "start_seconds": ["150", "70"], "properties": ["clickety-clack, train, whistle", "music, gunfire, explosion"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a car speeding up in the distance"], "sample_ids": ["sHbXC6na9hg", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["a person, saw, wood", "distance, car, speed"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", null], "captions_pred_audio": ["an engine is idling and vibrating", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a man speaks in the background while a slow tick repeats"], "sample_ids": ["tMbMDvT50j8", "vZAw4apG0Es"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "background, tick, repeat"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a baby cries and a woman speaks", "a clock is ticking and people are talking"], "question": "which entity has a tick repeating in the background?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wTideSjRFS0", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["food, sizzle, woman", "a, scream, girl"], 
"captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking?", "label": 1}, {"captions": ["a motorcycle engine is idling", "an airplane flies overhead as a woman speaks"], "sample_ids": ["vZAqdHZ81yA", "zj2R0XoFr5k"], "start_seconds": ["180", "50"], "properties": ["engine, motorcycle, idling", "airplane, fly, overhead"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["an engine is idling loudly", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["male speech with light ticking", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["xO-Q2BlIIPU", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["male, speech, ticking", "beeps, hit, woman"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, 
{"captions": ["an engine revs and a turning noise is made", "wind blows as people chatter quietly"], "sample_ids": ["tOSWIURC-4", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["noise, engine, revs", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "frogs croak and vocalize"], "sample_ids": ["uWAAAL4CIoc", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["a, dog, vocalize", "croak, vocalize, frog"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a frog is croaking"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["someone whistles a song", "dishes cling together then a man begins to speak"], "sample_ids": ["sIXTftIuUgw", "sQGXqGcwOTc"], "start_seconds": ["90", "3"], "properties": ["someone, song, whistle", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person whistling a song", "mechanisms are operating and water is splashing "], "question": "which entity is a person", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["w2M4i1mklOA", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "beeps, hit, woman"], "captions_pred_video": ["footage of an antique clock", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a beep sounds followed by a child speaking"], "question": "which entity is quieter", "label": 1}, {"captions": ["a dog barks and whimpers", "engines sputter roughly and tires squeal"], "sample_ids": ["sShpyu2l4YQ", "zhx6hoYrHeI"], "start_seconds": ["0", "160"], "properties": ["barks, whimpers, dog", "engine, sputter, rough"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a man working on a motorcycle's tire"], "captions_pred_audio": ["a dog is barking and growling", "a car accelerates and revs its engine "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman talking as an infant is crying", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["tMbMDvT50j8", "wyllXV6PjKo"], "start_seconds": ["12", "30"], "properties": ["a, talk, infant", "a baby, a woman, a man"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman speaks and a baby cries"], "question": "which entity has a baby?", "label": 1}, {"captions": ["male speech with light ticking", "people applaud and hoot and chat quietly"], "sample_ids": ["xO-Q2BlIIPU", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["male, speech, ticking", "people, applaud, hoot"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in 
the background", "a child speaks in closed space"], "sample_ids": ["yLy-WycbVVE", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["background, people, talk", "child, space, speak"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["s7knHCFW82w", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "gun, shoot, water"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "water is sprayed across a hard surface"], "sample_ids": ["vcmWSmvti8", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["music, man, fire", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "people speak as gunfire rings out"], "sample_ids": ["zkKdxzNC97Y", "wqTCwqVRDlk"], "start_seconds": ["27", "80"], "properties": ["hard, surface, door", "gunfire, ring, 
speak"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a duck quacks loudly and continuously"], "sample_ids": ["w2M4i1mklOA", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "loud, continuous, quacks"], "captions_pred_video": ["footage of an antique clock", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a car speeding up in the distance"], "sample_ids": ["wRV8yMk886E", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["liquid, spray, nozzle", "distance, car, speed"], "captions_pred_video": ["two cars are parked in a parking lot at night", null], "captions_pred_audio": ["a man speaks followed by a loud burst", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tqR406bGiE", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["flush, water, gurgle", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["children cheer as a man speaks then an audience screams", "a duck quacks continuously"], "sample_ids": ["vJvryTwuAV8", "vh30P49Po6s"], "start_seconds": ["16", "30"], 
"properties": ["audience, cheer, man", "quacks, continuously, duck"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vfYTJq7nU", "xBxDz0CFVn0"], "start_seconds": ["130", "30"], "properties": ["ducks, quack, man", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vqZuVbG6-HI", "vb1fPSDI4c"], "start_seconds": ["130", "30"], "properties": ["background, male, female", "multiple, people, yell"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sofxkNWaP0s", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["wind, engine, louder", "two men, woman, birds"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", null], "captions_pred_audio": ["a man is speaking while a jet engine roars in the 
background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a man talks while vehicles pass by", "an infant crying frantically"], "sample_ids": ["sK4u5T8hW78", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "cry, infant, frantically"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a child yells and another yells", "a stream of water runs briefly"], "sample_ids": ["vMDHu7Lxcgw", "x-PeY8Yb8M4"], "start_seconds": ["410", "300"], "properties": ["two, yell, child", "stream, water, run"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sofxkNWaP0s", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["wind, engine, louder", "airplane, boy, fly"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "footage of a small airplane flying in 
the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a plane flying?", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a motorcycle engine is idling"], "sample_ids": ["zuua6-5goWw", "vZAqdHZ81yA"], "start_seconds": ["30", "180"], "properties": ["birds, chirp, quiet, man, speaks", "engine, motorcycle, idling"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a motorcycle is parked on the side of the road with its rear end facing the viewer"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "an engine is idling loudly"], "question": "which is quieter", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a propeller rotates loudly and intensely"], "sample_ids": ["sSMl2vc3ek", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["a person, laughs, snores", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person snoring loudly", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a man speaks uses a drill"], "sample_ids": 
["siJFXfGWgDk", "x5eIC7S0fbg"], "start_seconds": ["50", "60"], "properties": ["man, woman, vehicle", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a duck quacks continuously"], "sample_ids": ["yeFvk9x0wWI", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["clack, bird, chirp", "quacks, continuously, duck"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 0}, {"captions": ["a duck quacks several times", "a toilet flushes and water drains unevenly"], "sample_ids": ["vh30P49Po6s", "vhJWZheqaE"], "start_seconds": ["30", "0"], "properties": ["quacks, duck, several", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a small engine idles continuously", "after a few seconds of silence, a loud bang occurs followed by a softer banging noise"], "sample_ids": ["y5WII6cTH7k", "zkKdxzNC97Y"], "start_seconds": ["40", "27"], "properties": ["engine, idle, continuously", "loud, bang, noise"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of the door opening and closing in slow motion"], "captions_pred_audio": 
["an engine is knocking and vibrating ", "a door is opened and closed"], "question": "which entity is louder", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sncRqQ67iJU", "wz7N8YRy74I"], "start_seconds": ["460", "30"], "properties": ["loud, repeatedly, man", "rooster, crow, background, men"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a person is snoring", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man?", "label": 0}, {"captions": ["speaking following by laughing and clapping", "a door slams shut roughly"], "sample_ids": ["u2f5NpsoHBg", "zkKdxzNC97Y"], "start_seconds": ["30", "27"], "properties": ["person, laugh, clap", "a door, slams, shut"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a door is opened and closed"], "question": "which entity is more likely to be a door", "label": 1}, {"captions": ["people speak then an engine runs", "a horn rings out as a machine runs by"], "sample_ids": ["uMTTDZ2mb4", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["engine, run, people", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a baby laughs giddily and a woman laughs then speaks"], "sample_ids": 
["sU53zg9Jp7s", "wjsXBsc7M40"], "start_seconds": ["380", "10"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a baby laughs, a woman laughs, a woman speaks"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "footage of the baby playing with a toothbrush"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a baby laughs and a woman speaks"], "question": "which entity is more playful", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["vveS8HT7Uog", "wwyfGO2J4"], "start_seconds": ["100", "90"], "properties": ["a man, objects, speak", "people, applaud, hoot"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "people are clapping and speaking with background noise "], "question": "which entity is a group of people?", "label": 1}, {"captions": ["male speech with light ticking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xO-Q2BlIIPU", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["male, speech, ticking", "engine, laugh, loud"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a duck quacks loudly and continuously"], "sample_ids": ["yZrFNS7GFBQ", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["pigeon, buzzes, insect", "loud, continuous, quacks"], "captions_pred_video": ["of the bird in the cage", "of a man brushing his teeth with a 
toothbrush in his mouth"], "captions_pred_audio": ["an owl hoots in the background ", "a duck is quacking loudly"], "question": "which animal is making a noise", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "water flows as men speak and yell"], "sample_ids": ["spYNpeN7rPY", "vJ7JPEFhyLA"], "start_seconds": ["1", "16"], "properties": ["a clock, ticktock, man", "water, flow, men"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zcDwZ6W7E3E", "uYT5gxnyMWM"], "start_seconds": ["180", "50"], "properties": ["a, man, speak", "a, scream, girl"], 
"captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a small engine idles continuously"], "sample_ids": ["s4Uz1Ffgo04", "y5WII6cTH7k"], "start_seconds": ["100", "40"], "properties": ["roars, background, people speaking", "engine, idle, continuously"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "an engine is knocking and vibrating "], "question": "which entity is quieter", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["rqu8iB22IY", "tDVADusiIoc"], "start_seconds": ["5", "60"], "properties": ["sound, repeats, laugh", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["wztCSUxOf8", "zO-LSSY92ZM"], "start_seconds": ["130", "30"], "properties": ["a crowd, yells, applauds", "liquid, surface, sound"], "captions_pred_video": [null, "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to 
clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "steam is hissing and hissing"], "question": "which entity is not a crowd?", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "small dogs yip and bark sharply"], "sample_ids": ["tPJvjq9QePY", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["animal, bleat, moo", "bark, yip, sharply"], "captions_pred_video": ["a dog and a sheep in a barn", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a baby cries and a man speaks", "a dog barks and growls"], "question": "which entity is more vocal", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a weapon fires multiple times"], "sample_ids": ["xSKJGCItUWE", "sMC07Ucy7kg"], "start_seconds": ["10", "10"], "properties": ["engine, run, boy", "weapon, fire, multiple"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage is from a car's point of view"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a 
door slams shut and an object moves on a hard surface", "waves crash against a shoreline and people speak"], "sample_ids": ["zkKdxzNC97Y", "yFB25fqfU8I"], "start_seconds": ["27", "300"], "properties": ["hard, surface, door", "wave, crash, shoreline"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be a natural phenomenon", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["un9VQlzgZM", "zl9Dqx-j7q4"], "start_seconds": ["5", "6"], "properties": ["females, talk, laugh", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man talks as several small engines run", "some men converse over an engine running"], "sample_ids": ["u9A6VZQCZpU", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["a, man, talk", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man talking?", "label": 0}, {"captions": ["an airplane engine runs", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yVPZ2MNWpms", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["engine, airplane, runs", "a woman, laughs, animal"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car 
is driving by on the road ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "someone is typing on a computer keyboard"], "sample_ids": ["uEU-Hg5MTN8", "v0x1odnXtP0"], "start_seconds": ["27", "210"], "properties": ["a woman, laughs, animal", "keyboard, type, computer"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sapQIQUhFc", "xfaoyyzw2WU"], "start_seconds": ["280", "180"], "properties": ["liquid, flow, distance", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "plastic is tapped on while someone speaks"], "sample_ids": ["vW4x7S1VfQc", "wvKpEYswXO0"], "start_seconds": ["150", "150"], "properties": ["clacking, oil, woman", "plastic, tap, speak"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "of the person preparing food in the kitchen"], "captions_pred_audio": ["food sizzles in a frying pan", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks?", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a loud snarling engine is followed by a man laughing"], 
"sample_ids": ["t8CV69hcvF0", "zl9Dqx-j7q4"], "start_seconds": ["210", "6"], "properties": ["person, sneeze, follow", "engine, laugh, loud"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman sneezes and speaks", "a jet engine roars "], "question": "which entity is followed by a person speaking", "label": 0}, {"captions": ["a male is speaking and a duck quacks as others laugh", "water splashes as an animal walks through"], "sample_ids": ["tiDFTC-5vU", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["male, duck, laugh", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uZesmtKZGSw", "su6FAOcOA8c"], "start_seconds": ["250", "4"], "properties": ["men, talk, cars", "engine, idle, woman"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a woman is speaking and a subway train is moving 
"], "question": "which entity is stationary", "label": 1}, {"captions": ["electronic beeps occur in a short series", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["y682ml90jGw", "ziUT9IFTkjg"], "start_seconds": ["11", "10"], "properties": ["beeps, series, electronic", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "birds are chirping and a chime is ringing "], "question": "which entity is more natural", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vfYTJq7nU", "zj2R0XoFr5k"], "start_seconds": ["130", "50"], "properties": ["rustling, ducks, quack", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about flying", "label": 1}, {"captions": ["paper is crumpling consistently", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["v5cSxLaHADY", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "rooster, crow, background, men"], "captions_pred_video": ["footage of the person holding a pair of scissors", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a video", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "sirens ring and approach with humming of distant traffic"], "sample_ids": 
["xKB8O8LTs6s", "xERFUeZONz8"], "start_seconds": ["70", "0"], "properties": ["music, gunfire, explosion", "ring, approach, traffic"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage is blurry due to camera shake or motion blur"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "an emergency vehicle siren blares"], "question": "which entity is more calm", "label": 1}, {"captions": ["bees buzz as wind blows", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tMJne1a4AFI", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["bees, buzz, wind", "a woman, a television program, a bird"], "captions_pred_video": ["a swarm of bees on the ground", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking and a dog is whimpering"], "question": "which entity is a still image?", "label": 0}, {"captions": ["a man speaks in the background while a slow tick repeats", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vZAw4apG0Es", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "beeps, hit, woman"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a clock is ticking and people are talking", "a beep sounds followed by a child speaking"], "question": 
"which entity has a woman talking?", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zj2R0XoFr5k", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["airplane, fly, overhead", "music, gunfire, explosion"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a clock ticktocks"], "sample_ids": ["su6FAOcOA8c", "v-g-j2uTByM"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a person screams glaringly", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xC8kbrKJmco", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["glaringly, screams, person", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a goat is bleating ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["water is sprayed across a hard surface", "paper folding and crinkling"], "sample_ids": ["sQwlkXjQabo", "zPpG3RD8lSs"], "start_seconds": ["10", "20"], "properties": ["water, spray, surface", "paper, fold, crinkle"], "captions_pred_video": ["a close-up of a red car with 
water droplets on the hood", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["spraying followed by silence", "the wind blows and a mouse clicks "], "question": "which entity is not a liquid", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["s6DESzUTGjY", "vzceMbklWc"], "start_seconds": ["16", "180"], "properties": ["wind, laugh, woman", "water, faucet, sink"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set 
up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "water is running and a man is speaking"], "question": "which entity is a video of a woman laughing?", "label": 0}, {"captions": ["a man speaks while water drains", "small dogs yip and bark sharply"], "sample_ids": ["vSeGhaZt-aI", "v-wcQf4BDY0"], "start_seconds": ["50", "120"], "properties": ["water, drain, man", "bark, yip, sharply"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["voJh2gJxXhA", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["music, frog, croak", "a woman, something, fried"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "- a woman cooking in the kitchen"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman talking?", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sdXV-ylviw", "zl9Dqx-j7q4"], "start_seconds": ["190", "6"], "properties": ["door, toilet, squeaks", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["an 
small aircraft engine runs and a boy speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xSKJGCItUWE", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["engine, run, boy", "a woman, something, fried"], "captions_pred_video": ["footage of the helicopter flying in the room", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an infant crying frantically", "waves crash against a shoreline and people speak"], "sample_ids": ["zwOBqeFTgiU", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["cry, infant, frantically", "wave, crash, shoreline"], "captions_pred_video": ["of the baby crying in the car seat", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "some tunes played by whistling"], "sample_ids": ["wqZ135Ssz0", "u6BnG6YZqJ4"], "start_seconds": ["60", "0"], "properties": ["two men, woman, birds", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["birds fly and flutter around", "a clock alarm sounds and gears turn"], "sample_ids": ["wGKgwOP3h30", "w2M4i1mklOA"], "start_seconds": ["30", "30"], "properties": ["fly, flutter, around", "alarm, gears, turn"], "captions_pred_video": ["of the pigeons in the coop", "footage of an antique clock"], "captions_pred_audio": ["pigeons coo 
and flap their wings", "a clock is ticking and a bell is ringing "], "question": "which entity is a clock?", "label": 1}, {"captions": ["birds chirp then an animal grunts", "several insects fly while two men talk"], "sample_ids": ["tDlysoZiA1I", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["animal, grunt, chirp", "several, fly, men"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about animals?", "label": 0}, {"captions": ["a airplane flies overhead as a woman speaks", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zj2R0XoFr5k", "y8WEcpOlT3I"], "start_seconds": ["50", "40"], "properties": ["airplane, fly, woman", "harsh, wind, blows"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["multiple ducks quack continuously", "people speak as gunfire rings out"], "sample_ids": ["wfHeoPDLMaM", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["multiple, quack, continuously", "gunfire, ring, speak"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the 
middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["ducks are quacking", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["birds vocalize and a man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["v0wPrLBI3hg", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["vocalize, bird, speak", "people, applaud, hoot"], "captions_pred_video": ["footage of the pigeons feeding on the ground", null], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "people speak as gunfire rings out"], "sample_ids": ["vSeGhaZt-aI", "wqTCwqVRDlk"], "start_seconds": ["50", "80"], "properties": ["water, bubbles, speak", "gunfire, ring, speak"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wRBHTgrbiwg", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["bird, owl, speak", "water, radio, man"], 
"captions_pred_video": ["of a bee pollinating the flowers in the field", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "water pouring and bubbling"], "sample_ids": ["t97k0cejSQE", "uyRfq-jKPpo"], "start_seconds": ["250", "50"], "properties": ["bird, chirp, insect", "water, bubbles, pouring"], "captions_pred_video": ["a bee on a purple thistle flower", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a horn rings out as a machine runs by"], "sample_ids": ["sZPuqDgX2V0", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["commentator, race, track", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a train is moving and blowing its horn with a 
clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vqZuVbG6-HI", "uZesmtKZGSw"], "start_seconds": ["130", "250"], "properties": ["background, male, female", "men, talk, cars"], "captions_pred_video": ["footage is blurry because it's raining outside", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "water flows and trickles"], "sample_ids": ["sdXV-ylviw", "tB7hWb9gTuQ"], "start_seconds": ["190", "30"], "properties": ["door, toilet, squeaks", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a dog barks and taps with background noise ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as horns blow", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tHyNqRyK34A", "tiDFTC-5vU"], "start_seconds": ["24", "30"], "properties": ["a, man, speaks", "male, duck, laugh"], "captions_pred_video": ["being taken 
from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck in it?", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["w8uLijTqtlU", "yks4cLgIDMc"], "start_seconds": ["70", "170"], "properties": ["wind, microphone, noise", "background, speaking, child"], "captions_pred_video": ["footage is blurry and shaky", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and a child is crying"], "question": "which entity is a recording", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "people applaud and hoot and chat quietly"], "sample_ids": ["sTpirNYo8vQ", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["a, tone, fast", "people, applaud, hoot"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a car speeding up in the distance"], "sample_ids": ["vqZuVbG6-HI", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["background, male, female", "distance, car, speed"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a machine beeps continuously"], "sample_ids": 
["vb1fPSDI4c", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["multiple, people, yell", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a crowd of people are talking and laughing", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "vehicle engines race around a track as a man commentates"], "sample_ids": ["w2JXXIAdUdg", "sZPuqDgX2V0"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "commentator, race, track"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking and a helicopter is flying overhead "], "question": "which entity is a video of a race?", "label": 1}, {"captions": ["a weapon fires multiple times", "an airplane engine runs"], "sample_ids": ["sMC07Ucy7kg", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["weapon, fire, multiple", "engine, airplane, runs"], "captions_pred_video": ["footage is from a car's point of view", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a car is driving by on the road "], "question": "which entity is not a weapon", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "water flows as men speak and yell"], "sample_ids": ["wqADXCzngMw", "vJ7JPEFhyLA"], "start_seconds": ["340", "16"], "properties": ["audio, humming, revving", "water, flow, men"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity 
is a video", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "water splashes as an animal walks through"], "sample_ids": ["vSeGhaZt-aI", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["water, bubbles, speak", "animal, water, splashes"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tQWGZLItBXk", "uZesmtKZGSw"], "start_seconds": ["170", "250"], "properties": ["voice, music, whoosh", "men, talk, cars"], "captions_pred_video": ["worms revolution screenshots", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["scraping and female speech with distant music", "a man speaks as a car is passing by"], "sample_ids": ["yHeVV-xeOxQ", "sK4u5T8hW78"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "a, car, pass"], 
"captions_pred_video": ["of a girl milking a goat's udder", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a power tool runs and touches a surface", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zfvPRf3chY", "uEU-Hg5MTN8"], "start_seconds": ["290", "27"], "properties": ["power tool, run, touch", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["water running down a sink while a man is talking", "paper folding and crinkling"], "sample_ids": ["vSeGhaZt-aI", "zPpG3RD8lSs"], "start_seconds": ["50", "20"], "properties": ["water, sink, talk", "paper, fold, crinkle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece 
of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a cat meows and children speak", "a stream of water runs briefly"], "sample_ids": ["x5cuQjOdM3E", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["cat, speak, children", "stream, water, run"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a cat meows and a woman speaks", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sShpyu2l4YQ", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["growl, bark, yip", "a, scream, girl"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a machine beeps continuously"], "sample_ids": ["w34HjHr6gAY", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["beeps, squawk, child speaking", "beeps, machine, continuously"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 
1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a beeping sound is being made "], "question": "which entity is a machine?", "label": 1}, {"captions": ["several insects fly while two men talk", "someone is typing on a computer keyboard"], "sample_ids": ["s-T9OVOiMLo", "v0x1odnXtP0"], "start_seconds": ["330", "210"], "properties": ["several, fly, men", "keyboard, type, computer"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a toilet flushes and water drains"], "sample_ids": ["v0x1odnXtP0", "sfAvvZwdLCY"], "start_seconds": ["210", "20"], "properties": ["keyboard, type, computer", "water drains, flushes, water"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a person is typing on a keyboard", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sSMl2vc3ek", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["a person, laughs, snores", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], 
"captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person?", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "some tunes played by whistling"], "sample_ids": ["yZp6xizR0yU", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["animal, bleat, cry", "tune, play, whistling"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["xzKKf9bKNUo", "xjvTpk2Zpr8"], "start_seconds": ["10", "70"], "properties": ["background, noise, snoring", "wind, blows, vehicle"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a person snoring loudly", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["t97k0cejSQE", "xKB8O8LTs6s"], "start_seconds": ["250", "70"], "properties": ["bird, chirp, insect", "music, gunfire, explosion"], "captions_pred_video": ["a bee on a purple thistle flower", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["an insect buzzes around continuously", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": 
["v25l1jef3JY", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["buzzes, continuously, insect", "animal, grunts, snorts"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 0}, {"captions": ["multiple insects buzz over rustling wind", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["tMJne1a4AFI", "xV7Mg1QucSc"], "start_seconds": ["0", "14"], "properties": ["wind, buzz, rustling", "alarm, ticktocks, laughs"], "captions_pred_video": ["a swarm of bees on the ground", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a swarm of bees buzzing around", "an alarm clock ticks and a woman laughs"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["uRExseg-0XI", "ziUT9IFTkjg"], "start_seconds": ["210", "10"], "properties": ["woman, man, water", "background, birds, rustling"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", null], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "birds are chirping and a chime is ringing "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["continuous chugging with birds chirping in the background", "a person uses a saw to cut some wood"], "sample_ids": ["xM4joTqDVp4", "sHbXC6na9hg"], "start_seconds": ["160", "0"], "properties": ["background, chirp, birds", "a person, saw, wood"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's 
smokestack", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["birds are chirping and a train is moving ", "an engine is idling and vibrating"], "question": "which entity is a person", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sQwlkXjQabo", "su6FAOcOA8c"], "start_seconds": ["10", "4"], "properties": ["water, spray, surface", "engine, idle, woman"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["spraying followed by silence", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a dark barks and whimpers", "a toilet door squeaks as it is opened"], "sample_ids": ["sYj4hpDUZDQ", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["barks, whimpers, dark", "door, toilet, squeaks"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "people applaud and hoot and chat quietly"], "sample_ids": ["xM4joTqDVp4", "wwyfGO2J4"], "start_seconds": ["160", "90"], "properties": ["background, chirp, birds", "people, applaud, hoot"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "a stream of water runs briefly"], "sample_ids": 
["zVacuqSb4LI", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["blares, fades, train", "stream, water, run"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vZAqdHZ81yA", "xfaoyyzw2WU"], "start_seconds": ["180", "180"], "properties": ["engine, motorcycle, idling", "loud, jet engine, roar"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an engine is idling loudly", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["a baby laugh at a sputter", "an insect buzzes around continuously"], "sample_ids": ["sLUnaPT5gM8", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["laugh, sputter, baby", "buzzes, continuously, insect"], "captions_pred_video": ["of a baby laying on his 
stomach in a blue shirt and diaper", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "dog barking and vehicle engine idling followed shortly by vehicle engine revving"], "sample_ids": ["w2JXXIAdUdg", "zY3icUyMdh8"], "start_seconds": ["10", "20"], "properties": ["snoring, distance, person", "dog, bark, engine"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a car is driving and dogs are barking and squealing "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a infant makes noise and is excited", "a machine beeps continuously"], "sample_ids": ["wIJK3-5y0kA", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["noise, excited, infant", "beeps, machine, continuously"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp as a bell rings", "speaking following by laughing and clapping"], "sample_ids": ["ziUT9IFTkjg", "u2f5NpsoHBg"], "start_seconds": ["10", "30"], "properties": ["chirp, bell, ring", "person, laugh, clap"], "captions_pred_video": [null, "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman is speaking and a crowd is clapping"], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a child speaks in closed 
space"], "sample_ids": ["w2M4i1mklOA", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["alarm, gears, turn", "child, space, speak"], "captions_pred_video": ["footage of an antique clock", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["spYNpeN7rPY", "wRBHTgrbiwg"], "start_seconds": ["1", "50"], "properties": ["a clock, ticktock, man", "bird, owl, speak"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "birds are chirping and insects are buzzing"], "question": "which entity has a clock 
ticking?", "label": 0}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["ukg5L09Wpvo", "zFjIWfSD-4"], "start_seconds": ["150", "410"], "properties": ["a train, a horn, a bell", "People, motor, brakes"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor?", "label": 1}, {"captions": ["people speak softly as food sizzles", "a pigeon cooing as an insect buzzes by briefly"], "sample_ids": ["yhQ2Lg-7qDY", "yZrFNS7GFBQ"], "start_seconds": ["130", "30"], "properties": ["food, sizzle, speak", "pigeon, buzzes, insect"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "of the bird in the cage"], "captions_pred_audio": ["a faucet is running and a man is speaking", "an owl hoots in the background "], "question": "which entity is a bird?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "people applaud and hoot and chat quietly"], "sample_ids": ["sxYkFKFIZD0", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["screech, man, door", "people, applaud, hoot"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, 
{"captions": ["an insect buzzes around continuously", "water runs into a sink while men speak"], "sample_ids": ["v25l1jef3JY", "vzceMbklWc"], "start_seconds": ["0", "180"], "properties": ["buzzes, continuously, insect", "water, sink, run"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "water is running and a man is speaking"], "question": "which entity is not a living thing", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "people cheer as a vehicle engine revs"], "sample_ids": ["tw76HGONaKg", "xjhAnI2q6hM"], "start_seconds": ["570", "6"], "properties": ["music, click, man", "engine revs, vehicle, people"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "someone is typing on a computer 
keyboard"], "sample_ids": ["s7knHCFW82w", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["blow horn, get close, train", "keyboard, type, computer"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "how to make money on youtube in spanish"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a person is typing on a keyboard"], "question": "which is not a type of computer", "label": 0}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uEU-Hg5MTN8", "vfYTJq7nU"], "start_seconds": ["27", "130"], "properties": ["animal, grunts, snorts", "rustling, ducks, quack"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["a speedboat passes quickly on the water", "an infant crying as a woman laughs"], "sample_ids": ["tjmoSi330GM", "xhmRY9yhC7c"], "start_seconds": ["23", "20"], "properties": ["speed, water, boat", "a, laugh, infant"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a baby cries and a woman speaks"], "question": "which is not a person", "label": 0}, {"captions": ["white noise and snoring with some rustling in the background", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xzKKf9bKNUo", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["background, noise, snoring", "clickety-clack, train, whistle"], "captions_pred_video": 
["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a person snoring loudly", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sa6TLVbooCc", "yswmmRZFItk"], "start_seconds": ["240", "0"], "properties": ["people, laugh, child", "background, frog, croak"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["a man speaks as horns blow", "people applaud and hoot and chat quietly"], "sample_ids": ["tHyNqRyK34A", "wwyfGO2J4"], "start_seconds": ["24", "90"], "properties": ["a, man, speaks", "people, applaud, hoot"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a duck quacks continuously"], "sample_ids": ["wTideSjRFS0", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["food, sizzle, woman", "quacks, continuously, duck"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the 
sink", "an airplane engine runs"], "sample_ids": ["vzceMbklWc", "yVPZ2MNWpms"], "start_seconds": ["180", "0"], "properties": ["water, faucet, sink", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["water is running and a man is speaking", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["goats bleat and people speak", "a duck quacks continuously"], "sample_ids": ["z5iUE5h0EPs", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["goats bleat, people speak, language", "quacks, continuously, duck"], "captions_pred_video": ["of the goat in the barn", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a goat bleats and a man speaks", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["an infant crying as a woman laughs", "rain falls on a surface as men speak and thunder roars"], "sample_ids": ["xhmRY9yhC7c", "w0xsN8X18Y"], "start_seconds": ["20", "30"], "properties": ["a, laugh, infant", "rain, thunder, surface"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is a natural event", "label": 1}, {"captions": ["water bubbles and gurgles.", "a man speaks as a car is passing by"], "sample_ids": ["tB7hWb9gTuQ", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["bubbles, gurgles, water", "a, car, pass"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["water is splashing and gurgling", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a moving object", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a woman speaks as she rubs two objects together"], "sample_ids": ["vfYTJq7nU", "vzxHnu-SFEw"], "start_seconds": ["130", "80"], "properties": ["rustling, ducks, quack", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["a door slams shut roughly", "several insects fly while two men talk"], "sample_ids": ["zkKdxzNC97Y", "s-T9OVOiMLo"], "start_seconds": ["27", "330"], 
"properties": ["a door, slams, shut", "several, fly, men"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a zoo", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "someone snores nearby"], "sample_ids": ["vBslzh7saPw", "spJCm8tD9Zo"], "start_seconds": ["90", "90"], "properties": ["power, scream, increase", "someone snores, nearby, someone"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a person is snoring loudly"], "question": "which is quieter", "label": 0}, {"captions": ["birds coo incessantly", "a duck quacks continuously"], "sample_ids": ["yZrFNS7GFBQ", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["coo, bird, incessant", "quacks, continuously, duck"], "captions_pred_video": ["of the bird in the cage", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["an owl hoots in the background ", "a duck is quacking loudly"], "question": "which bird is more likely to be a male", "label": 1}, {"captions": ["wind blowing followed by a zoom", "water splashes as an animal walks through"], "sample_ids": ["vr8ZXjEBhMQ", "w1ir-sZ3Im8"], "start_seconds": ["150", "90"], "properties": ["wind, blow, zoom", "animal, water, splashes"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a bird is 
chirping and tweeting a bird song", "a clock ticks quietly and rhythmically"], "sample_ids": ["wPz6QRAkEb4", "u7C-AEBQM"], "start_seconds": ["60", "30"], "properties": ["chirps, tweets, song", "ticks, rhythmic, quiet"], "captions_pred_video": ["a bird in a cage on top of a pole", null], "captions_pred_audio": ["birds are chirping in the background ", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "small dogs yip and bark sharply"], "sample_ids": ["sLUnaPT5gM8", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["loud, laughter, intermittent", "bark, yip, sharply"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "an infant crying as a woman laughs"], "sample_ids": ["wqUmIEzuNz4", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["frog, bird, vocalize", "a, laugh, infant"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a cat meows and rustles", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["speaking following by laughing and clapping", "women speak as water runs briefly, children call out, and a man speaks"], "sample_ids": ["u2f5NpsoHBg", "uRExseg-0XI"], "start_seconds": ["30", "210"], "properties": ["person, laugh, clap", "woman, man, water"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon"], "captions_pred_audio": ["a woman is 
speaking and a crowd is clapping", "a man is speaking while water is running and birds are chirping "], "question": "which entity has more people", "label": 1}, {"captions": ["a clock ticktocks briefly", "an airplane engine spools and people speak"], "sample_ids": ["u7C-AEBQM", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["ticktocks, clock, ticktocks briefly", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a ticktock of a clock", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["someone whistles briefly", "some men converse over an engine running"], "sample_ids": ["uFoga8sHpiw", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["sound, duration, pitch", "men, converse, engine"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is longer", "label": 1}, {"captions": ["a person is snoring while sleeping", "an airplane engine spools and people speak"], "sample_ids": ["vJrjSeP17yE", "wTjoRj1se3U"], "start_seconds": ["40", "390"], "properties": ["a person is sleeping, snoring, person", "airplane, engine, spool"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a person snoring loudly", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a rumble grows louder", "a man speaks as a car is passing by"], "sample_ids": ["y4MY9mp8-TA", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["loudness, increase, rumble", "a, car, pass"], "captions_pred_video": ["a helicopter flying in the sky", "for 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a helicopter flies overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a rumble", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a infant makes noise and is excited"], "sample_ids": ["xKB8O8LTs6s", "wIJK3-5y0kA"], "start_seconds": ["70", "30"], "properties": ["music, gunshots, explosion", "noise, excited, infant"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle engine works nearby", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tOSWIURC-4", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["engine, work, nearby", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a lawn mower is running ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing far away?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["sQwlkXjQabo", "zO-LSSY92ZM"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "liquid, surface, sound"], "captions_pred_video": ["a 
close-up of a red car with water droplets on the hood", "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["spraying followed by silence", "steam is hissing and hissing"], "question": "which entity is sprayed across a hard surface", "label": 0}, {"captions": ["an small aircraft engine runs and a boy speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xSKJGCItUWE", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", "multiple, people, yell"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["loud, continuous burping", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["y636gklDioE", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["loud, continuous, burping", "background, male, female"], "captions_pred_video": ["a dog sitting on a red chair in front 
of an old telephone", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person burps loudly several times", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a child speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yW6FWLSLkx4", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["a, child, speaks", "stream, water, flow"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a horn rings out as a machine runs by"], "sample_ids": ["zk-xJGQU8-4", "slZLHwNbbt4"], "start_seconds": ["130", "300"], "properties": ["food, man, woman", "a, horn, run"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["w2M4i1mklOA", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["alarm, gears, turn", "a woman, laughs, animal"], "captions_pred_video": ["footage of an antique clock", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking and a baby is crying"], "question": "which entity is a clock?", "label": 0}, {"captions": ["a woman sneezes then 
speaks", "wind blowing followed by a zoom"], "sample_ids": ["x4dZyf9Gbj0", "vr8ZXjEBhMQ"], "start_seconds": ["130", "150"], "properties": ["sneezes, speaks, woman", "wind, blow, zoom"], "captions_pred_video": ["footage is blurry and out of focus", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman sneezes and speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wDVMhEdTiVw", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["gun, shoot, water", "engine, revs, vehicle"], "captions_pred_video": ["a blurry image of trees and water in the forest", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man speaks as a vehicle engine idles", "a train horn blows as it passes by"], "sample_ids": ["shmR4OZtzqA", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "horn, blows, train"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man speaks while a motor runs", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zgUgkpk78xU", "vb1fPSDI4c"], "start_seconds": ["70", "30"], "properties": ["clinking, humming, horn", "multiple, people, yell"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sa6TLVbooCc", "zFjIWfSD-4"], "start_seconds": ["240", "410"], "properties": ["people, laugh, 
child", "People, motor, brakes"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", null], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a child speaking?", "label": 0}, {"captions": ["a woman and man are speaking", "a machine beeps continuously"], "sample_ids": ["vbpKkWvfOu4", "y682ml90jGw"], "start_seconds": ["560", "11"], "properties": ["two people, speaking, woman, man", "beeps, machine, continuously"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["an audience gives applause", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["x6iCUDmRpKQ", "tdWhHV3X25Q"], "start_seconds": ["38", "60"], "properties": ["applause, audience, give", "applause, audience, yells"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a group of people are clapping and cheering", "a man is speaking and a crowd is clapping"], "question": "which audience is giving applause", "label": 1}, {"captions": ["continuous snoring", "plastic is tapped on while someone speaks"], "sample_ids": ["sLkeqCDJIyw", "wvKpEYswXO0"], "start_seconds": ["120", "150"], "properties": ["loud, snoring, noise", "plastic, tap, speak"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "women speak as water runs briefly, children call out, and a man speaks"], "sample_ids": ["uEU-Hg5MTN8", "uRExseg-0XI"], "start_seconds": ["27", "210"], "properties": ["animal, grunts, snorts", "woman, man, water"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while water is running and birds are chirping "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "paper is crumpling consistently"], "sample_ids": ["ujMt0-D-x2k", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["snoring, rhythmical, nearby", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person is snoring loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "some men converse over an engine running"], "sample_ids": ["sjlVMgdGSK0", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["car, revving, loudly", "men, converse, engine"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is not a car", 
"label": 1}, {"captions": ["a man speaks as a scratching occurs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sOa7g-44Dag", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["audio, scratching, man", "rustling, ducks, quack"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a duck quacks and a woman speaks"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a airplane flies overhead as a woman speaks", "someone is typing on a computer keyboard"], "sample_ids": ["zj2R0XoFr5k", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["airplane, fly, woman", "keyboard, type, computer"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a person is typing on a keyboard"], "question": "which is not a person", "label": 0}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["u--KhUW8l1Y", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["engine, sound, horn", "rooster, crow, background, men"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "small dogs yip and bark sharply"], "sample_ids": ["xBxDz0CFVn0", "v-wcQf4BDY0"], 
"start_seconds": ["30", "120"], "properties": ["wind, chatter, people", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "birds chirp and objects are moved around"], "sample_ids": ["uqFtmnhuqA8", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["a, b, c", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "insects buzz and a man speaks"], "question": "which entity is more like a bird chirping?", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "small dogs yip and bark sharply"], "sample_ids": ["y2ZBGpgbhHM", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["dog, chirp, breathe", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds chirping and a dog panting", "a dog barks and growls"], "question": "which dog is more active", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["yaln9y8I7ms", "vKrYfzleLB8"], "start_seconds": ["230", "110"], "properties": ["female, flushes, toilet", "a, ring, gunshots"], "captions_pred_video": ["footage is blurry and out of focus", "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking with background noise and 
a cap gun is fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a woman speaks happily and an animal chirps"], "sample_ids": ["uKCSGgof8gI", "uWAAAL4CIoc"], "start_seconds": ["12", "0"], "properties": ["chirps, distance, signal", "a woman, chirps, animal"], "captions_pred_video": ["footage of a street in a small town on a sunny day", null], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "water flows as men speak and yell"], "sample_ids": ["y8WEcpOlT3I", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["harsh, wind, blows", "water, flow, men"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a propeller rotates loudly and intensely"], "sample_ids": ["uiS58TNyUiw", "ugHJF0hfYkg"], "start_seconds": ["430", "10"], "properties": ["audio, man, speaking", "loud, intense, propeller"], "captions_pred_video": ["of the pigeon in the cage", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["multiple ducks quack continuously", "a clock ticktocks"], "sample_ids": ["wfHeoPDLMaM", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "ticktocks, clock, ticktocks"], "captions_pred_video": ["ducks in a barn with a 
fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["ducks are quacking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "someone is typing on a computer keyboard"], "sample_ids": ["slZLHwNbbt4", "v0x1odnXtP0"], "start_seconds": ["300", "210"], "properties": ["clap, distance, horn", "keyboard, type, computer"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "how to make money on youtube in spanish"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["someone snores nearby", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["spJCm8tD9Zo", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["someone snores, nearby, someone", "female, spraying, scream"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a person spraying paint on 
the ceiling"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["several ducks are quacking and squawking", "water splashes and a door squeaks"], "sample_ids": ["wfHeoPDLMaM", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["quacking, squawking, ducks", "sound, splash, door"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a dog barks and taps with background noise "], "question": "which entity is silent", "label": 1}, {"captions": ["scraping and female speech with distant music", "ticking continues without interruption"], "sample_ids": ["yHeVV-xeOxQ", "v-g-j2uTByM"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "ticking, continuous, clock"], "captions_pred_video": ["of a girl milking a goat's udder", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["an 
electronic device bleeps once", "a child speaks in closed space"], "sample_ids": ["tHJ6JSa8Y4", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["bleeps, electronic, device", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a clock is ticking and beeping", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "water pouring and bubbling"], "sample_ids": ["siJFXfGWgDk", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["a, bird, vehicle", "water, bubbles, pouring"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "a man speaks as a motor runs in the background"], "sample_ids": ["w9lpbUn0hPc", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["male, wind, rustling", 
"background, motor, run"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["an engine runs and wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vs65y4qmyBE", "su6FAOcOA8c"], "start_seconds": ["340", "4"], "properties": ["engine, run, wind", "engine, idle, woman"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a woman speaks as she rubs two objects together"], "sample_ids": ["wRBHTgrbiwg", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], "properties": ["bird, owl, speak", "two objects, woman, speak"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a woman speaking?", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "water splashes as an animal walks through"], "sample_ids": ["v5P-ThUCINM", "w1ir-sZ3Im8"], "start_seconds": ["400", "90"], "properties": ["background, chirp, bird", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and birds are chirping", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a person speaks over rustling leaves", "some people speak"], "sample_ids": ["zOZleIRqZm4", "vbZ-0lGPneg"], "start_seconds": ["80", "30"], "properties": ["rustling, leaves, person", "some people speak English, some people speak Spanish, some people speak French"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a dog is whimpering"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["water runs into a sink while men speak", "people speak as gunfire rings out"], "sample_ids": ["vzceMbklWc", "wqTCwqVRDlk"], "start_seconds": ["180", "80"], "properties": ["water, sink, run", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking and a 
gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "a loud engine muffles a man as he speaks"], "sample_ids": ["vlS6YMeWAPo", "xyx6eNVEYRY"], "start_seconds": ["40", "380"], "properties": ["sheep, baa, birds", "loud, engine, muffles"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a goat bleats and birds chirp", "an aircraft engine is running and a man is speaking "], "question": "which entity is muffled", "label": 1}, {"captions": ["a jet engine spools up and takes off", "children speak and play together"], "sample_ids": ["vBslzh7saPw", "yVVP8XvWJTo"], "start_seconds": ["90", "260"], "properties": ["engine, spools, takes", "children, speak, play"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a jet engine roars and accelerates ", "children are speaking and breathing with background noise "], "question": "which entity is more likely to be in motion", "label": 0}, {"captions": ["birds chirp and objects are moved around", "wind blowing followed by a zoom"], "sample_ids": ["yPUYU6t3rwo", "vr8ZXjEBhMQ"], "start_seconds": ["370", "150"], "properties": ["birds chirp, objects are moved around, birds", "wind, blow, zoom"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["insects buzz and a man speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a natural phenomenon", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["wEBlkGWVWwE", "xZepNM9qcRA"], "start_seconds": ["260", "30"], 
"properties": ["a, babble, woman", "background, motor, run"], "captions_pred_video": ["shows a person writing on the whiteboard", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["vs65y4qmyBE", "xjhAnI2q6hM"], "start_seconds": ["340", "6"], "properties": ["engine, run, man", "engine revs, vehicle, people"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a truck is revving its engine and a man is speaking "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a male speaks and another male speaks", "a door opens and closes"], "sample_ids": ["viuTg1M-dqg", "vBHyYJ8pL0"], "start_seconds": ["30", "2"], "properties": ["two males, speaking, male", "open, close, door"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is a door?", "label": 1}, {"captions": ["a man talks while vehicles pass by", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sK4u5T8hW78", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "rooster, crow, background, men"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people in it", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "an engine runs loudly"], "sample_ids": ["wqZ135Ssz0", "vqZuVbG6-HI"], "start_seconds": ["60", "130"], "properties": ["man, woman, squawks", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a church bell rings several times", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sUVVjE3Ucp8", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["ring, bell, several", "three men, wind, flow"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a church bell is ringing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a bell ringing?", "label": 0}, {"captions": ["a woman speaks and other women and a man talk with her", "a propeller rotates loudly and intensely"], "sample_ids": ["vbpKkWvfOu4", "ugHJF0hfYkg"], "start_seconds": ["560", "10"], "properties": ["a, woman, man", "loud, intense, propeller"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a vehicle engine accelerating then running on idle"], "sample_ids": ["weDbePuc-Xc", "vYkA3cfXp5Q"], "start_seconds": ["40", "30"], "properties": ["music, slaps, human", "engine, accelerate, idle"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["t25U-v4k4ts", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["a, chirps, bird", "wind, blow, vehicle"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "several insects fly while two men talk"], "sample_ids": ["sapQIQUhFc", "s-T9OVOiMLo"], "start_seconds": ["280", "330"], "properties": ["liquid, flow, distance", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man 
is speaking and a stream is flowing in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a clock ticktocks"], "sample_ids": ["zofjfKhqLk8", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["vehicles pass by on a roadway", "water pouring and bubbling"], "sample_ids": ["tgbONvsP47Y", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["pass, vehicle, roadway", "water, bubbles, pouring"], "captions_pred_video": ["footage of a fire truck entering a garage", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a car is driving on the road ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "paper is crumpling 
consistently"], "sample_ids": ["ziUT9IFTkjg", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["background, birds, rustling", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "some men converse over an engine running"], "sample_ids": ["wSVhSdj0F0", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["beep, clang, footsteps", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sjlVMgdGSK0", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["car, revving, loudly", "clickety-clack, train, whistle"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a train blows its whistle and blows its horn "], "question": "which is quieter", "label": 0}, {"captions": ["a man rubs two objects together then speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vveS8HT7Uog", "wz7N8YRy74I"], "start_seconds": ["100", "30"], "properties": ["a man, objects, speak", "rooster, crow, background, men"], "captions_pred_video": ["footage is of a workbench with various 
tools on it including a hammer and a screwdriver", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "someone whistles a tune"], "sample_ids": ["se87d6yxEOA", "sIXTftIuUgw"], "start_seconds": ["10", "90"], "properties": ["run, whistle, pass", "someone, tune, whistle"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", null], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["a baby cries and a woman moans", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["smDKStoHBJo", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["a, cry, woman", "music, gunfire, explosion"], "captions_pred_video": ["a man holding a crying baby in his arms", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["spJCm8tD9Zo", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["snores, wheezes, sleeps", "female, spraying, scream"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, 
{"captions": ["winds blows roughly as a vehicle races past", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["xjvTpk2Zpr8", "tiDFTC-5vU"], "start_seconds": ["70", "30"], "properties": ["wind, blows, vehicle", "male, duck, laugh"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vZAqdHZ81yA", "uEU-Hg5MTN8"], "start_seconds": ["180", "27"], "properties": ["engine, motorcycle, idling", "animal, grunts, snorts"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is idling loudly", "a woman is speaking and a baby is crying"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vzxHnu-SFEw", "uZesmtKZGSw"], "start_seconds": ["80", "250"], "properties": ["two objects, woman, speak", "men, talk, cars"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yRx9txMcBl0", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["accelerates, tires, squeals", "female, spraying, scream"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a person spraying paint on the ceiling"], 
"captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["water rushes by", "water flows and trickles"], "sample_ids": ["x-PeY8Yb8M4", "tB7hWb9gTuQ"], "start_seconds": ["300", "30"], "properties": ["water, rushes, by", "water, flow, trickle"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car is driving on a wet road ", "water is splashing and gurgling"], "question": "which entity is more calm", "label": 1}, {"captions": ["water flows and trickles", "people speak as gunfire rings out"], "sample_ids": ["tB7hWb9gTuQ", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["water, flow, trickle", "gunfire, ring, speak"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["water is splashing and gurgling", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["xl2PIWyXaM", "xZepNM9qcRA"], "start_seconds": ["160", "30"], "properties": ["chirp, man, younger person", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping and people are talking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "winds blows roughly as a vehicle races past"], "sample_ids": ["w8uLijTqtlU", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], 
"properties": ["wind, microphone, noise", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry and shaky", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["the wind is blowing strongly", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "an airplane engine runs"], "sample_ids": ["vveS8HT7Uog", "yVPZ2MNWpms"], "start_seconds": ["100", "0"], "properties": ["a man, objects, speak", "engine, airplane, runs"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a car is driving by on the road "], "question": "which object is moving", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a horn rings out as a machine runs by"], "sample_ids": ["zFjIWfSD-4", "slZLHwNbbt4"], "start_seconds": ["410", "300"], "properties": ["People, motor, brakes", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["birds chirp and wind blows", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sxIvBMSavMQ", "xKB8O8LTs6s"], "start_seconds": ["210", "70"], "properties": ["birds, chirp, wind", "music, gunfire, explosion"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive 
beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "music plays, a person speaks, followed by whooshes and a ding"], "sample_ids": ["w5W5Kqtc8E", "tQWGZLItBXk"], "start_seconds": ["100", "170"], "properties": ["wind, engine, scream", "music, person, ding"], "captions_pred_video": [null, "worms revolution screenshots"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity is a musical composition?", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["w0xsN8X18Y", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["music, surface, rain", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be a natural occurrence", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "an insect buzzes 
around continuously"], "sample_ids": ["x5cuQjOdM3E", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["cat, meows, young woman", "buzzes, continuously, insect"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a cat meows and a woman speaks", "a fly is buzzing around a microphone "], "question": "which entity is more active", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vlS6YMeWAPo", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["noise, bleat, call", "loud, multiple, distance"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wqZ135Ssz0", "sSMl2vc3ek"], "start_seconds": ["60", "20"], "properties": ["man, woman, squawks", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks continuously", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vlJS7LN2XyM", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["ticktocks, clock, ticktocks continuously", "wind, blow, vehicle"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a motorboat is moving and people are 
shouting and cheering "], "question": "which entity is not a clock?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uWAAAL4CIoc", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["a woman, chirps, animal", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a duck quacks and a woman speaks"], "question": "which entity has more animals", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["xl2PIWyXaM", "rqu8iB22IY"], "start_seconds": ["160", "5"], "properties": ["chirp, man, younger person", "sound, repeats, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people are talking", "a dog barks and a man speaks while music plays "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["vs65y4qmyBE", "uWAAAL4CIoc"], "start_seconds": ["340", "0"], "properties": ["engine, run, man", "a woman, chirps, animal"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a dog is barking "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["an electric engine works nearby followed by a child talking", "dishes cling together then a man begins to speak"], "sample_ids": ["xSKJGCItUWE", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["engine, work, child", "cling, speak, dishes"], "captions_pred_video": ["footage of the helicopter flying in the room", "a man washing dishes 
in a commercial kitchen"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["children cry and people talk", "an airplane flies overhead as a woman speaks"], "sample_ids": ["xLwHe825Zs", "zj2R0XoFr5k"], "start_seconds": ["18", "50"], "properties": ["people talk, children cry, people talk", "airplane, fly, overhead"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a plane flying overhead?", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "people speak as gunfire rings out"], "sample_ids": ["y8dSeubCNI", "wqTCwqVRDlk"], "start_seconds": ["4", "80"], "properties": ["engine revving, people speaking, motorcycle", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["ukg5L09Wpvo", "vfYTJq7nU"], "start_seconds": ["150", "130"], "properties": ["clickety-clack, train, whistle", "rustling, ducks, quack"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", 
"label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sjlVMgdGSK0", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["car, revving, loudly", "a woman, something, fried"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tw76HGONaKg", "vfYTJq7nU"], "start_seconds": ["570", "130"], "properties": ["A, game, keyboard", "rustling, ducks, quack"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a duck quacks and a woman speaks"], "question": "which entity is about a game?", "label": 
0}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a woman speaks as frying food sizzles"], "sample_ids": ["sEprKHm8Sj8", "wTideSjRFS0"], "start_seconds": ["90", "30"], "properties": ["car, tires, slows", "food, sizzle, woman"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking while water is running in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "wind blows as people chatter quietly"], "sample_ids": ["uiS58TNyUiw", "xBxDz0CFVn0"], "start_seconds": ["430", "30"], "properties": ["vocalize, bird, chirp", "wind, chatter, people"], "captions_pred_video": ["of the pigeon in the cage", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["bees buzz as wind blows", "a female speaks softly as paper crinkles"], "sample_ids": ["tMJne1a4AFI", "xvDdE3zNf8Y"], "start_seconds": ["0", "120"], "properties": ["bees, buzz, wind", "a, female, speaks"], "captions_pred_video": ["a swarm of bees on the ground", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman speaks and crumples paper"], "question": "which entity is speaking softly", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uzQnlJXBbOM", "uEU-Hg5MTN8"], 
"start_seconds": ["50", "27"], "properties": ["ringing, beep, stop", "a woman, laughs, animal"], "captions_pred_video": ["footage of a person using a cell phone on a table", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a telephone rings and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["an infant crying as a woman laughs", "a woman speaks and other women and a man talk with her"], "sample_ids": ["xhmRY9yhC7c", "vbpKkWvfOu4"], "start_seconds": ["20", "560"], "properties": ["a, laugh, infant", "a, woman, man"], "captions_pred_video": ["of a baby crying in a baby bouncer", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as crickets sing", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["ryFDPxgDOGc", "wDVMhEdTiVw"], "start_seconds": ["570", "30"], "properties": ["a, crickets, sing", "gun, shoot, water"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a child yells and another yells", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vMDHu7Lxcgw", "wqZ135Ssz0"], "start_seconds": ["410", "60"], "properties": ["two, yell, child", "two men, woman, birds"], 
"captions_pred_video": ["a boy playing on a trampoline in the backyard", null], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["two frogs croak at each other", "a man speaks as a car is passing by"], "sample_ids": ["zg0X6BnhOLQ", "sK4u5T8hW78"], "start_seconds": ["410", "30"], "properties": ["two frogs, croak, at each other", "a, car, pass"], "captions_pred_video": ["footage of lightning in the sky at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a frog is croaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a woman speaks happily and an animal chirps"], "sample_ids": ["wyllXV6PjKo", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["a kid, talk, cry", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a dog is barking "], "question": "which entity has a kid?", "label": 0}, {"captions": ["a man speaks then multiple motorcycles pass by", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zcDwZ6W7E3E", "uZesmtKZGSw"], "start_seconds": ["180", "250"], "properties": ["a, man, speak", "men, talk, cars"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more vehicles", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "dishes cling together then a man begins to speak"], "sample_ids": ["vZAw4apG0Es", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["people, clock, converse", "cling, speak, dishes"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a clock is ticking and people are talking", "mechanisms are operating and water is splashing "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "three men talk while wind blows and some liquid flows"], "sample_ids": ["y8WEcpOlT3I", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["wind, speak, buffeting", "three men, wind, flow"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a person whistles a meandering tune", "water 
splashes as an animal walks through"], "sample_ids": ["uFoga8sHpiw", "w1ir-sZ3Im8"], "start_seconds": ["90", "90"], "properties": ["person, tune, whistle", "animal, water, splashes"], "captions_pred_video": ["footage of a bird in a cage", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a person whistles a song", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "water is sprayed across a hard surface"], "sample_ids": ["w2JXXIAdUdg", "sQwlkXjQabo"], "start_seconds": ["10", "10"], "properties": ["snoring, distance, person", "water, spray, surface"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a person snoring and a dog whimpering", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uYT5gxnyMWM", "w34HjHr6gAY"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "beeps, hit, woman"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a beep sounds 
followed by a child speaking"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "water flows as men speak and yell"], "sample_ids": ["tw76HGONaKg", "vJ7JPEFhyLA"], "start_seconds": ["570", "16"], "properties": ["A, game, keyboard", "water, flow, men"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking while playing a video game on a keyboard?", "label": 0}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a man speaks as a motor runs in the background"], "sample_ids": ["wvKpEYswXO0", "xZepNM9qcRA"], "start_seconds": ["150", "30"], "properties": ["water, tap, run", "background, motor, run"], "captions_pred_video": ["of the person preparing food in the kitchen", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and 
water running ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "roadway noise occurs and a truck accelerates"], "sample_ids": ["sSMl2vc3ek", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["loud, multiple, distance", "noise, truck, accelerate"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on the road "], "question": "which noise is louder", "label": 1}, {"captions": ["some tunes played by whistling", "bees buzz as wind blows"], "sample_ids": ["u6BnG6YZqJ4", "tMJne1a4AFI"], "start_seconds": ["0", "0"], "properties": ["tune, play, whistling", "bees, buzz, wind"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "a swarm of bees on the ground"], "captions_pred_audio": ["a person whistling a song", "a swarm of bees buzzing around"], "question": "which entity is not a musical instrument", "label": 1}, {"captions": ["food is frying and sizzles", "some men converse over an engine running"], "sample_ids": ["zNRChLjqcU", "sCiy7QS1U"], "start_seconds": ["220", "300"], "properties": ["food is frying, sizzles, food", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "water rushes by"], "sample_ids": ["s7knHCFW82w", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["blow horn, get close, train", "water, rushes, by"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "a man sitting on a rock in the middle of a 
river"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a car is driving on a wet road "], "question": "which is a moving object", "label": 0}, {"captions": ["some tunes played by whistling", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["u6BnG6YZqJ4", "t25U-v4k4ts"], "start_seconds": ["0", "40"], "properties": ["tune, play, whistling", "a, chirps, bird"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and bees are buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["yajyRTUQk3U", "wDVMhEdTiVw"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "gun, shoot, water"], "captions_pred_video": ["- a woman cooking in the kitchen", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vbr9mHKc8WM", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["noise, loudness, engine", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a baby is crying"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man laughs and speaks as cats purr and hiss", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vVhthZ45k3Y", "vJ7JPEFhyLA"], 
"start_seconds": ["30", "16"], "properties": ["cat, purr, hiss", "three men, wind, flow"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a toilet door squeaks as it is opened"], "sample_ids": ["ukxt9I7eMMg", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["food, pan, cook", "door, toilet, squeaks"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a dog barks and taps with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["ziUT9IFTkjg", "tDlysoZiA1I"], "start_seconds": ["10", "0"], "properties": ["background, birds, rustling", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "birds are chirping and a rooster is crowing "], "question": "which entity has more birds", "label": 1}, {"captions": ["a person speaks over rustling leaves", "an insect buzzes around continuously"], "sample_ids": ["zOZleIRqZm4", "v25l1jef3JY"], "start_seconds": ["80", "0"], "properties": ["rustling, leaves, person", "buzzes, continuously, insect"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a fly is buzzing around a 
microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tIY7qOV3rEM", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "engine, idle, woman"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a subway train is moving "], "question": "which entity is a human activity", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "people applaud and hoot and chat quietly"], "sample_ids": ["w9lpbUn0hPc", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["male, wind, rustling", "people, applaud, hoot"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", null], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be in a theater", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a crowd yells, reacts and applauds"], "sample_ids": ["s4Uz1Ffgo04", "wztCSUxOf8"], "start_seconds": ["100", "130"], "properties": ["water, rushes, motorcycle", "a crowd, yells, applauds"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking and a crowd is clapping"], "question": "which entity is more likely to be a video of a person riding a motorcycle?", "label": 0}, {"captions": ["water is sprayed across a hard 
surface", "water splashes and a door squeaks"], "sample_ids": ["sQwlkXjQabo", "sdXV-ylviw"], "start_seconds": ["10", "190"], "properties": ["water, spray, surface", "sound, splash, door"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a dog barks and taps with background noise "], "question": "which entity is sprayed across a hard surface", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "waves crash against a shoreline and people speak"], "sample_ids": ["sQwlkXjQabo", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["liquid, surface, spray", "wave, crash, shoreline"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of a person surfing in the ocean"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be seen in a movie", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a telephone rings followed by a woman talking"], "sample_ids": ["vddP56-ogds", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["water, flow, laugh", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["leaves rustle while man speaks", "wind blowing followed by a zoom"], "sample_ids": ["zOZleIRqZm4", "vr8ZXjEBhMQ"], "start_seconds": ["80", "150"], "properties": ["leaves, rustle, speak", "wind, blow, zoom"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "wind blows and a chainsaw cuts 
through wood "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a jet engine roars, almost making a man inaudible", "an electric engine works nearby followed by a child talking"], "sample_ids": ["xfaoyyzw2WU", "xSKJGCItUWE"], "start_seconds": ["180", "10"], "properties": ["loud, jet engine, roar", "engine, work, child"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "footage of the helicopter flying in the room"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a high pitched engine is running and a child speaks"], "question": "which engine is quieter", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "water drips and bubbles as a man speaks"], "sample_ids": ["y8WEcpOlT3I", "vSeGhaZt-aI"], "start_seconds": ["40", "50"], "properties": ["wind, speak, buffeting", "water, bubbles, speak"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is more calm", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["ugHJF0hfYkg", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["engine, running, continuously", "engine, revs, vehicle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which entity has a running engine", "label": 0}, {"captions": ["water pouring and bubbling", "paper folding and crinkling"], "sample_ids": ["uyRfq-jKPpo", "zPpG3RD8lSs"], "start_seconds": ["50", "20"], "properties": ["water, bubbles, pouring", 
"paper, fold, crinkle"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["water is running from a faucet", "the wind blows and a mouse clicks "], "question": "which entity is not a liquid", "label": 1}, {"captions": ["a man speaks as horns blow", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tHyNqRyK34A", "w5W5Kqtc8E"], "start_seconds": ["24", "100"], "properties": ["a, man, speaks", "wind, blow, 
vehicle"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "several insects fly while two men talk"], "sample_ids": ["zuua6-5goWw", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["sound, pop, bird", "several, fly, men"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["an engine runs and wind blows", "people speak then an engine runs"], "sample_ids": ["vs65y4qmyBE", "uMTTDZ2mb4"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "engine, run, people"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "people are talking and a car is driving by with wind noise in the background "], "question": "which entity 
has more people", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a woman speaks as she rubs two objects together"], "sample_ids": ["zgUgkpk78xU", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["horn, bells, ring", "two objects, woman, speak"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a warning device?", "label": 0}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a machine clanks and thumps and a male speaks"], "sample_ids": ["sU53zg9Jp7s", "sWZzXuWYY"], "start_seconds": ["380", "420"], "properties": ["a bird 
chirps, a door bell ringing, a woman gasps", "male, clanks, thumps"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a sewing machine runs and a man speaks"], "question": "which entity has a male speaking?", "label": 1}, {"captions": ["children cry and people talk", "a horn honks and then loudly blares"], "sample_ids": ["xLwHe825Zs", "wnpJndXuxLc"], "start_seconds": ["18", "50"], "properties": ["people talk, children cry, people talk", "horn, honk, loud"], "captions_pred_video": [null, "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a baby cries and a woman speaks", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xl2PIWyXaM", "yajyRTUQk3U"], "start_seconds": ["160", "400"], "properties": ["chirp, man, younger person", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and people are talking", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["water bubbles and gurgles.", "waves crash against a shoreline and people speak"], "sample_ids": ["tB7hWb9gTuQ", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["bubbles, gurgles, water", "wave, crash, shoreline"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "footage of a person surfing in the ocean"], "captions_pred_audio": ["water is splashing and gurgling", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity 
is more calm", "label": 0}, {"captions": ["a horn blasts as warning bells ring", "small dogs yip and bark sharply"], "sample_ids": ["zgUgkpk78xU", "v-wcQf4BDY0"], "start_seconds": ["70", "120"], "properties": ["horn, bells, ring", "bark, yip, sharply"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a car accelerates and wind blows"], "sample_ids": ["tw76HGONaKg", "u0TrcHhkPQ"], "start_seconds": ["570", "20"], "properties": ["music, click, man", "accelerates, wind, blows"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on 
a computer keyboard ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["people clap and speak in the distance", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["wwyfGO2J4", "tDlysoZiA1I"], "start_seconds": ["90", "0"], "properties": ["clap, distance, speak", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "birds are chirping and a rooster is crowing "], "question": "which entity is more animal like", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a duck quacks continuously"], "sample_ids": ["s59PfAghdkM", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["bird, chirp, background, horse, neigh", "quacks, continuously, duck"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 0}, {"captions": ["a toilet flushes and water sputters as it drains", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["smGI3C1NZc", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["water, drain, toilet", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person whistles a meandering tune", "a person speaks over rustling leaves"], "sample_ids": ["uFoga8sHpiw", "zOZleIRqZm4"], "start_seconds": ["90", "80"], "properties": ["person, tune, whistle", "rustling, 
leaves, person"], "captions_pred_video": ["footage of a bird in a cage", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a person whistles a song", "a man is speaking with crickets chirping in the background"], "question": "which person is speaking over rustling leaves", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "water pouring and bubbling"], "sample_ids": ["uqFtmnhuqA8", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["a, b, c", "water, bubbles, pouring"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "material crumbles into a microphone"], "sample_ids": ["zY3icUyMdh8", "vofpvUo6NAw"], "start_seconds": ["20", "220"], "properties": ["dog, bark, engine", "material, crumbles, microphone"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "person wrapping a toy car in a plastic bag"], 
"captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "paper is being crumpled and crinkled"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "water is sprayed across a hard surface"], "sample_ids": ["ujMt0-D-x2k", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["snoring, rhythmical, nearby", "water, spray, surface"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a person is snoring loudly", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an audience gives applause", "a man speaks as a motor runs in the background"], "sample_ids": ["x6iCUDmRpKQ", "xZepNM9qcRA"], "start_seconds": ["38", "30"], "properties": ["applause, audience, give", "background, motor, run"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a group of people are clapping and cheering", "a man speaks while a motorcycle revs and accelerates "], "question": "which is a man speaking", "label": 1}, {"captions": ["an engine runs loudly", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vqZuVbG6-HI", "vfYTJq7nU"], "start_seconds": ["130", "130"], "properties": ["loud, engine, run", "rustling, ducks, quack"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a duck quacks and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "water flows as men speak and yell"], "sample_ids": ["xfaoyyzw2WU", "vJ7JPEFhyLA"], "start_seconds": 
["180", "16"], "properties": ["loud, jet engine, roar", "water, flow, men"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which is louder", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["ugHJF0hfYkg", "zl9Dqx-j7q4"], "start_seconds": ["10", "6"], "properties": ["loud, propeller, move", "engine, laugh, loud"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a helicopter is flying overhead ", "a jet engine roars "], "question": "which is louder", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uzQnlJXBbOM", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "rooster, crow, background, men"], "captions_pred_video": ["footage of a person using a cell phone on a table", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a telephone rings and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a vehicle engine accelerates and wind blows"], "sample_ids": ["xNMovAf3o50", "wudZTNBtVqc"], "start_seconds": ["0", "60"], "properties": ["rain, thunder, music", "accelerates, engine, wind"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "footage is of a parking lot with cars parked in it"], "captions_pred_audio": ["thunder and rain with 
music playing in the background ", "a car accelerates and revs its engine "], "question": "which entity is not a weather phenomenon", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a man speaks then blows a vehicle horn as wind blows"], "sample_ids": ["s4Uz1Ffgo04", "zALy31PjDl0"], "start_seconds": ["100", "21"], "properties": ["water, rushes, motorcycle", "a man, a vehicle, a horn"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a motorcycle is parked on the side of a brick walkway"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking and a car horn is honking"], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["wind blows and people talk while livestock vocalizes", "a frog croaks as other frogs croak in the background"], "sample_ids": ["vXlk0lIQBFo", "yswmmRZFItk"], "start_seconds": ["470", "0"], "properties": ["wind, talk, vocalize", "background, frog, croak"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a close up of a frog in the water"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a frog is croaking"], "question": "which entity is a single animal", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "birds chirp and objects are moved around"], "sample_ids": ["ul60S8TXDA8", "yPUYU6t3rwo"], "start_seconds": ["60", "370"], "properties": ["sound, distance, bell", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a pigeon cooing as an insect 
buzzes by briefly", "an infant crying as a woman laughs"], "sample_ids": ["yZrFNS7GFBQ", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["pigeon, buzzes, insect", "a, laugh, infant"], "captions_pred_video": ["of the bird in the cage", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["an owl hoots in the background ", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["a clock ticktocks briefly", "a boat travels through the waves as the wind blows loudly and a man speaks over a radio"], "sample_ids": ["u7C-AEBQM", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["ticktocks, clock, ticktocks briefly", "wind, radio, waves"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "a stream of water runs briefly"], "sample_ids": ["w8uLijTqtlU", "x-PeY8Yb8M4"], "start_seconds": ["70", "300"], "properties": ["wind, microphone, noise", "stream, water, run"], "captions_pred_video": ["footage is blurry and shaky", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["the wind is blowing strongly", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a music is played followed by a frog croaking and then music is played again"], "sample_ids": ["tw76HGONaKg", "voJh2gJxXhA"], "start_seconds": ["570", "50"], "properties": ["A, game, keyboard", "music, frog, croak"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of 
zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a frog on a black background with a red diamond in the center"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "music is playing and crickets are chirping "], "question": "which entity is not a video game", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wqZ135Ssz0", "vbZ-0lGPneg"], "start_seconds": ["60", "30"], "properties": ["man, woman, squawks", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wz7N8YRy74I", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["rooster, crow, background, people", "harsh, wind, blows"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is 
crowing ", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tDlysoZiA1I", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["animal, grunt, multiple", "engine, idle, woman"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "someone is typing on a computer keyboard"], "sample_ids": ["uWAAAL4CIoc", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["a woman, chirps, animal", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a fly buzzes around loudly as birds chirp"], "sample_ids": ["ylpYOorfH4o", "uJV8NDaHqqk"], "start_seconds": ["410", "100"], "properties": ["motor, run, steady", "loud, fly, chirp"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 
1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a bee hive in a wooden box"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a swarm of bees buzzing around"], "question": "which entity is louder", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a man speaks followed by another man speaking outside"], "sample_ids": ["zsLxS-uLJTw", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["horn, blast, train", "two men, speak, follow"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single speaker?", "label": 0}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "water flows and trickles"], "sample_ids": ["yYEVLuqEytU", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "water, flow, trickle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["several sheep bleat and a man speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "an adult male speaks and dials a rotary phone"], "sample_ids": ["uPDn2BFTHk", "tK4VlLsNxak"], "start_seconds": ["140", "120"], "properties": ["woman, laughs, speaks", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": [null, "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking 
and using a sewing machine"], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying then a woman speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["ukxt9I7eMMg", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["food, woman, speak", "engine, laugh, loud"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a man speaks followed by another man speaking outside"], "sample_ids": ["se87d6yxEOA", "viuTg1M-dqg"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "two men, speak, follow"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["someone whistles a tune", "people speak in the background as a clock ticktocks"], "sample_ids": ["sIXTftIuUgw", "vZAw4apG0Es"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "background, clock, ticktocks"], "captions_pred_video": [null, "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a person whistling a song", "a clock is ticking and people are talking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a male speaks over some small clicks", "pigeons vocalize and birds chirp"], "sample_ids": ["uXxVebHsGZ8", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["male, clicks, 
speak", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "people applaud and hoot and chat quietly"], "sample_ids": ["wqUmIEzuNz4", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["frog, bird, vocalize", "people, applaud, hoot"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", null], "captions_pred_audio": ["a cat meows and rustles", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "a child speaks in closed space"], "sample_ids": ["tK4VlLsNxak", "yW6FWLSLkx4"], "start_seconds": ["120", "40"], "properties": ["a, dial, telephone", "child, space, speak"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "a soft wind underscores a woman laughing"], "sample_ids": ["x9JovgqUcs", "s6DESzUTGjY"], "start_seconds": ["500", "16"], "properties": ["a, man, speaks, keyboard", "wind, laugh, woman"], "captions_pred_video": [null, "how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up 
an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a motorboat is moving with wind noise in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a toilet flushes and a female speaks"], "sample_ids": ["xC8kbrKJmco", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["background, goat, scream", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a goat is bleating ", "a toilet flushes and a man speaks"], "question": "which entity is silent", "label": 1}, {"captions": ["speaking following by laughing and clapping", "multiple birds chirp and an animal grunts"], "sample_ids": ["u2f5NpsoHBg", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["person, laugh, clap", "animal, grunt, multiple"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "birds are chirping and a rooster is crowing "], "question": "which entity is a person", "label": 0}, {"captions": ["a man speaks as music plays before artillery is fired", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vcmWSmvti8", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["music, man, fire", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": 
["a man is speaking music is playing and an explosion is heard ", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking and an animal grunting and snorting?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "people speak as gunfire rings out"], "sample_ids": ["vfYTJq7nU", "wqTCwqVRDlk"], "start_seconds": ["130", "80"], "properties": ["rustling, ducks, quack", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a propeller rotates loudly and intensely"], "sample_ids": ["vhJWZheqaE", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["water drains unevenly, toilet flushes, water drains", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a toilet is flushed", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a telephone rings followed by a woman talking"], "sample_ids": ["wIvYjuR3nrg", "tGcFnX0GHI"], "start_seconds": ["9", "0"], "properties": ["birds, pigeons, vocalize", "ring, talk, woman"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", null], "captions_pred_audio": ["birds are chirping and cooing", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "vehicles pass by on a roadway"], "sample_ids": ["tQWGZLItBXk", "tgbONvsP47Y"], 
"start_seconds": ["170", "0"], "properties": ["voice, music, whoosh", "pass, vehicle, roadway"], "captions_pred_video": ["worms revolution screenshots", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "waves crash against a shoreline and people speak"], "sample_ids": ["zcDwZ6W7E3E", "yFB25fqfU8I"], "start_seconds": ["180", "300"], "properties": ["man, speak, motorcycles", "wave, crash, shoreline"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is a natural phenomenon", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a man speaks as a car is passing by"], "sample_ids": ["ylpYOorfH4o", "sK4u5T8hW78"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "a, car, pass"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking with background noise and breathing sounds "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone snores nearby", "a man speaks while a machine runs before a smoke alarm beeps"], "sample_ids": ["spJCm8tD9Zo", "sG7TyPnFDR0"], "start_seconds": ["90", "180"], "properties": ["someone snores, nearby, someone", "beeps, machine, smoke alarm"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a person is using an espresso machine in a restaurant"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a microwave oven is beeping "], "question": "which entity is about a machine?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "an insect buzzes around continuously"], "sample_ids": ["x9JovgqUcs", "v25l1jef3JY"], "start_seconds": ["500", "0"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["an insect buzzes around continuously", "a child speaks in closed space"], "sample_ids": ["v25l1jef3JY", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["buzzes, continuously, insect", "child, space, speak"], "captions_pred_video": ["a black background with a cartoon character in the foreground", 
"of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not a child?", "label": 0}, {"captions": ["food is frying while a woman speaks", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["yhQ2Lg-7qDY", "uWPRNLnpy7Y"], "start_seconds": ["130", "10"], "properties": ["food, woman, speak", "accelerate, laugh, vehicle"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "is taken from a car driving down the street"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "cats meow and then a person begins to talk while the cats continue to meow"], "sample_ids": ["vBHyYJ8pL0", "x5cuQjOdM3E"], "start_seconds": ["2", "30"], "properties": ["noise, door, opening", "cat, talk, meow"], "captions_pred_video": [null, "a black background with an airplane flying in the sky"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a cat meows and a woman speaks"], "question": "which entity is accompanied by a person talking", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sAam2NqGhLY", "yDoT73BWsdA"], "start_seconds": ["20", "10"], "properties": ["snoring, breathing, child", "engine, revs, vehicle"], "captions_pred_video": ["of a little girl sleeping on a couch", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a person is snoring", "a race car accelerates and revs its engine "], "question": "which entity is not a person", "label": 1}, {"captions": ["men speak and a nozzle sprays 
liquid", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wRV8yMk886E", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "engine, accelerate, idle"], "captions_pred_video": ["two cars are parked in a parking lot at night", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man speaks followed by a loud burst", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks as a machine runs", "people cheer as a vehicle engine revs"], "sample_ids": ["vD6lYD1l0BY", "xjhAnI2q6hM"], "start_seconds": ["330", "6"], "properties": ["a, machine, run", "engine revs, vehicle, people"], "captions_pred_video": ["game controller being held in the hands of the person", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a stream of water runs briefly"], "sample_ids": ["ukxt9I7eMMg", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["food, pan, cook", "stream, water, run"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "water is sprayed across a hard surface"], "sample_ids": ["vBslzh7saPw", "sQwlkXjQabo"], "start_seconds": ["90", "10"], "properties": ["engine, roar, louder", "water, spray, surface"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a close-up of a red car with water droplets on the 
hood"], "captions_pred_audio": ["a jet engine roars and accelerates ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["ticking continues without interruption", "a man speaks and is typing on a keyboard"], "sample_ids": ["v-g-j2uTByM", "x9JovgqUcs"], "start_seconds": ["30", "500"], "properties": ["ticking, continuous, clock", "a, man, speaks, keyboard"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", null], "captions_pred_audio": ["a clock is ticking loudly", "a man speaks and types on a keyboard"], "question": "which entity is not continuous", "label": 1}, {"captions": ["water splashes as an animal walks through", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["w1ir-sZ3Im8", "vfYTJq7nU"], "start_seconds": ["90", "130"], "properties": ["animal, water, splashes", "rustling, ducks, quack"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["a man makes an exclamation, then another man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xO-Q2BlIIPU", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["two men, exclamation, speak", "men, talk, cars"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a weapon fires multiple times"], "sample_ids": ["x9JovgqUcs", "sMC07Ucy7kg"], "start_seconds": ["500", "10"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "weapon, fire, multiple"], "captions_pred_video": [null, "footage is from a car's point of view"], "captions_pred_audio": ["a man speaks and types on a keyboard", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["dogs barking and whimpering", "a propeller rotates loudly and intensely"], "sample_ids": ["tIY7qOV3rEM", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["barking, whimpering, dog", "loud, intense, propeller"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a small engine idles continuously"], "sample_ids": ["xBxDz0CFVn0", "y5WII6cTH7k"], "start_seconds": ["30", "40"], "properties": ["stream, water, flow", "engine, idle, continuously"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "an engine is knocking and vibrating "], "question": 
"which entity is not a stream of water?", "label": 1}, {"captions": ["an airplane engine runs", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yVPZ2MNWpms", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["engine, airplane, runs", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car is driving by on the road ", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "an airplane engine spools and people speak"], "sample_ids": ["vSeGhaZt-aI", "wTjoRj1se3U"], "start_seconds": ["50", "390"], "properties": ["water, bubbles, run", "airplane, engine, spool"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a jet engine is running and people are talking"], "question": "which entity is a video of a man speaking and water bubbles and runs?", "label": 0}, {"captions": ["paper is crumpling consistently", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["v5cSxLaHADY", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "a woman, laughs, animal"], "captions_pred_video": ["footage of the person holding a pair of scissors", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a clock ticktocks in wind", "a saw finishes running as metal clings in the background"], 
"sample_ids": ["yVumC9TGknc", "zofjfKhqLk8"], "start_seconds": ["30", "10"], "properties": ["ticktocks, clock, wind", "background, metal, clings"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a series of beeps and chirps", "a large engine is running and a bell is ringing"], "question": "which entity is a clock", "label": 0}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a woman speaks happily and an animal chirps"], "sample_ids": ["sLUnaPT5gM8", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["loud, laughter, intermittent", "a woman, chirps, animal"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "winds blows roughly as a vehicle races past"], "sample_ids": ["vVhthZ45k3Y", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["cat, purr, hiss", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a jet engine roars and wind blows "], "question": "which entity is more likely to be a windy day", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "water flows and trickles"], "sample_ids": ["wy1eKjR7KC0", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "water, flow, trickle"], "captions_pred_video": ["two police officers riding motorcycles down the street", "the rocks on the beach are surrounded by water and the sky is visible in the background"], 
"captions_pred_audio": ["a man is speaking and a siren is going off", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane accelerates briefly", "water splashes as an animal walks through"], "sample_ids": ["zjTG0gaGCUI", "w1ir-sZ3Im8"], "start_seconds": ["80", "90"], "properties": ["accelerates, airplane, briefly", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a jet engine roars as wind blows ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a car", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man talks while a clock does ticktock"], "sample_ids": ["sfAvvZwdLCY", "spYNpeN7rPY"], "start_seconds": ["20", "1"], "properties": ["water drains, flushes, water", "a clock, ticktock, man"], "captions_pred_video": ["footage of the toilet in the bathroom", "in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and breathing with background noise "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "people cheer as a vehicle engine revs"], "sample_ids": ["w-4gHptFNuU", "xjhAnI2q6hM"], "start_seconds": ["21", "6"], "properties": ["engine revs, accelerates, bump", "engine revs, vehicle, people"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zkKdxzNC97Y", "y8WEcpOlT3I"], "start_seconds": ["27", "40"], "properties": ["loud, bang, noise", 
"harsh, wind, blows"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking with wind noise in the background "], "question": "which entity is softer", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wqADXCzngMw", "sSMl2vc3ek"], "start_seconds": ["340", "20"], "properties": ["engine, idle, man", "loud, multiple, distance"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", null], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a man speaks as a car is passing by"], "sample_ids": ["yajyRTUQk3U", "sK4u5T8hW78"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "a, car, pass"], "captions_pred_video": ["- a woman cooking in the kitchen", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "an animal quacks rapidly"], "sample_ids": ["vf9xf3vMsGM", "vh30P49Po6s"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a 
water faucet on.", "animal, quacks, rapidly"], "captions_pred_video": ["of the person washing their hands under the faucet", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a duck is quacking loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xKB8O8LTs6s", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["music, radio, gunshots", "female, spraying, scream"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking on a radio?", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vs65y4qmyBE", "uZesmtKZGSw"], "start_seconds": ["340", "250"], "properties": ["wind, blows, strongly", "men, talk, cars"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking and a 
car is revving with laughter in the background "], "question": "which entity is more likely to be in a city", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sK4u5T8hW78", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["a, car, pass", "water, radio, man"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a clock ticktocks"], "sample_ids": ["sfAvvZwdLCY", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a toilet is flushed", "a clock is ticking loudly"], "question": "which entity is a timepiece", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "water splashes and a motorboat passes as people yell"], "sample_ids": ["vSeGhaZt-aI", "w5W5Kqtc8E"], "start_seconds": ["50", "100"], "properties": ["water, bubbles, run", "water, splashes, motorboat"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and 
pouring liquid with background noise ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["t25U-v4k4ts", "y8WEcpOlT3I"], "start_seconds": ["40", "40"], "properties": ["bees buzz, birds chirp, man speaks", "harsh, wind, blows"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be in a calm environment", "label": 0}, {"captions": ["a clock ticktocks", "a vehicle accelerates and squeals tires"], "sample_ids": ["v-g-j2uTByM", "yRx9txMcBl0"], "start_seconds": ["30", "40"], "properties": ["ticktocks, clock, ticktocks", "accelerates, tires, squeals"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a clock is ticking loudly", "a car is revving its engine and skidding "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an engine runs and a man speaks", "a train horn sounds as a railroad passing bell rings"], "sample_ids": ["yT5WfYMRr-U", "zgUgkpk78xU"], "start_seconds": ["30", "70"], "properties": 
["engine, run, man", "horn, bell, train"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a train blows its horn as it speeds down the tracks "], "question": "which train is going to pass the other train?", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["zj2R0XoFr5k", "ugHJF0hfYkg"], "start_seconds": ["50", "10"], "properties": ["airplane, fly, woman", "loud, intense, propeller"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "people speak as gunfire rings out"], "sample_ids": ["wyllXV6PjKo", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["a kid, talk, cry", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "birds chirp and objects are moved around"], "sample_ids": ["zl9Dqx-j7q4", "yPUYU6t3rwo"], "start_seconds": ["6", "370"], "properties": ["engine, laugh, loud", "birds chirp, objects are moved around, birds"], 
"captions_pred_video": ["footage of a man driving a car in the dark", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a jet engine roars ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "paper folding and crinkling"], "sample_ids": ["t97k0cejSQE", "zPpG3RD8lSs"], "start_seconds": ["250", "20"], "properties": ["bird, chirp, insect", "paper, fold, crinkle"], "captions_pred_video": ["a bee on a purple thistle flower", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "the wind blows and a mouse clicks "], "question": "which entity is not a living thing", "label": 1}, {"captions": ["a baby cries and a woman speaks", "water flows and trickles"], "sample_ids": ["tMbMDvT50j8", "tB7hWb9gTuQ"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "water, flow, trickle"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a 
baby cries and a woman speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y1saVTXsKwc", "w5W5Kqtc8E"], "start_seconds": ["80", "100"], "properties": ["a, dog, talk", "wind, blow, vehicle"], "captions_pred_video": ["a dog playing with a pink ball", null], "captions_pred_audio": ["a dog barks and a man speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vbZ-0lGPneg", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["a woman, a television program, a bird", "applause, audience, yells"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "water pouring and bubbling"], "sample_ids": ["xZepNM9qcRA", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["background, motor, run", "water, bubbles, pouring"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "water is running from a faucet"], "question": "which entity is more active", "label": 1}, {"captions": ["a person snoring several times", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["spJCm8tD9Zo", "tdWhHV3X25Q"], "start_seconds": ["90", "60"], "properties": ["snore, person, several", "applause, audience, yells"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman and man are speaking", "small dogs yip and bark sharply"], "sample_ids": ["vbpKkWvfOu4", "v-wcQf4BDY0"], "start_seconds": ["560", "120"], "properties": ["two people, speaking, woman, man", "bark, yip, sharply"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream of water runs briefly", "some tunes played by whistling"], "sample_ids": ["x-PeY8Yb8M4", "u6BnG6YZqJ4"], "start_seconds": ["300", "0"], 
"properties": ["stream, water, run", "tune, play, whistling"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a car is driving on a wet road ", "a person whistling a song"], "question": "which entity is not a stream of water?", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "water rushes and then a vehicle zooms past"], "sample_ids": ["s4Uz1Ffgo04", "s4Uz1Ffgo04"], "start_seconds": ["100", "100"], "properties": ["water, rushes, motorcycle", "water, rushes, vehicle"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity shows a vehicle zooming past?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "some men converse over an engine running"], "sample_ids": ["vzxHnu-SFEw", "sCiy7QS1U"], "start_seconds": ["80", "300"], "properties": ["two objects, woman, speak", "men, converse, engine"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which object is being rubbed together", "label": 0}, {"captions": ["a man speaks while water drains", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vSeGhaZt-aI", "uZesmtKZGSw"], "start_seconds": ["50", "250"], "properties": ["water, drain, man", "men, talk, cars"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a clock ticktocks continuously", "a motorcycle engine works nearby"], "sample_ids": ["vlJS7LN2XyM", "tOSWIURC-4"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, ticktocks continuously", "engine, work, nearby"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a lawn mower 
is running "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man talks as several small engines run", "ticking continues without interruption"], "sample_ids": ["u9A6VZQCZpU", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "ticking, continuous, clock"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vXlk0lIQBFo", "uYT5gxnyMWM"], "start_seconds": ["470", "50"], "properties": ["wind, speak, vocalize", "a, scream, girl"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking followed by a scream?", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "pigeons vocalize and birds chirp"], "sample_ids": ["sapQIQUhFc", "uiS58TNyUiw"], "start_seconds": ["280", "430"], "properties": ["liquid, flow, distance", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["waves crash and wind blows", "rain falls loudly as the build up of thunder and lightning occurs in the distance"], "sample_ids": ["vwqaIHKxLvM", "wulOEFdECWs"], "start_seconds": ["20", "10"], "properties": ["wind, crash, wave", "rain, thunder, lightning"], 
"captions_pred_video": ["of a surfer riding a big wave in the ocean", "a view of the city at night from a window in the rain"], "captions_pred_audio": ["waves crash and wind blows ", "a heavy rain is falling on a surface"], "question": "which entity is more likely to cause damage", "label": 1}, {"captions": ["a power tool runs and touches a surface", "water splashes as an animal walks through"], "sample_ids": ["zfvPRf3chY", "w1ir-sZ3Im8"], "start_seconds": ["290", "90"], "properties": ["power tool, run, touch", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "water splashes and gurgles as people speak"], "question": "which entity is not a power tool?", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "small dogs yip and bark sharply"], "sample_ids": ["sEprKHm8Sj8", "v-wcQf4BDY0"], "start_seconds": ["90", "120"], "properties": ["noise, loud, buzzing", "bark, yip, sharply"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "people applaud and hoot and chat quietly"], "sample_ids": ["y2bVZ7rz-5M", "wwyfGO2J4"], "start_seconds": ["280", "90"], "properties": ["engine, horn, siren", "people, applaud, hoot"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren 
is blaring ", "people are clapping and speaking with background noise "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tDlfY3nmx1A", "zj2R0XoFr5k"], "start_seconds": ["160", "50"], "properties": ["applause, laugh, man", "airplane, boy, fly"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "wind blows as people chatter quietly"], "sample_ids": ["w6RTHR6AeAg", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["call, owl, screech", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sfAvvZwdLCY", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "stream, water, flow"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with wind noise in the background "], "question": "which entity is moving water", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a person snores loudly multiple times at a close distance"], "sample_ids": ["un9VQlzgZM", "sSMl2vc3ek"], 
"start_seconds": ["5", "20"], "properties": ["wind, speak, laugh", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["ukg5L09Wpvo", "y2bVZ7rz-5M"], "start_seconds": ["150", "280"], "properties": ["a train, a horn, a bell", "motor noise, horn, siren"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn and a bell?", "label": 0}, {"captions": ["a child and woman laughs and the woman speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uPDn2BFTHk", "wz7N8YRy74I"], "start_seconds": ["140", "30"], "properties": ["woman, laughs, speaks", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wRBHTgrbiwg", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["bird, owl, speak", "three men, wind, flow"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man 
is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking briefly?", "label": 0}, {"captions": ["birds coo incessantly", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yZrFNS7GFBQ", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["coo, bird, incessant", "clickety-clack, train, whistle"], "captions_pred_video": ["of the bird in the cage", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an owl hoots in the background ", "a train blows its whistle and blows its horn "], "question": "which is continuous", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "water splashes as an animal walks through"], "sample_ids": ["vzxHnu-SFEw", "w1ir-sZ3Im8"], "start_seconds": ["80", "90"], "properties": ["two objects, woman, speak", "animal, water, splashes"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking and breathing with 
mechanisms in the background ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "paper is repeatedly crumpled and crinkled"], "sample_ids": ["tK4VlLsNxak", "vms5XGTDVQc"], "start_seconds": ["120", "220"], "properties": ["a, dial, telephone", "paper, crumpled, crinkled"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a woman opening a black bag on a table"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "paper is crumpled and crinkled"], "question": "which is not a rotary telephone", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "water splashes as an animal walks through"], "sample_ids": ["sTpirNYo8vQ", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["a, tone, fast", "animal, water, splashes"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["frogs croak and vocalize", "some men converse over an engine running"], "sample_ids": ["yswmmRZFItk", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["croak, vocalize, frog", "men, converse, engine"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zOZleIRqZm4", "tDVADusiIoc"], "start_seconds": ["80", "60"], "properties": ["rustling, leaves, person", "water, radio, man"], 
"captions_pred_video": ["a person picking berries from the bushes in the garden", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "wind blows and people scream while an engine revs"], "sample_ids": ["u2f5NpsoHBg", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["person, laugh, clap", "wind, engine, scream"], "captions_pred_video": ["is being projected on a screen at the front of the stage", null], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["sjlVMgdGSK0", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["car, revving, loudly", "sound, chirp, buzz"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a bee on a purple thistle flower"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a bee buzzes and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "vehicles pass by on a roadway"], "sample_ids": ["sShpyu2l4YQ", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["growl, bark, yip", "pass, vehicle, roadway"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a dog is barking and growling", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an engine 
sputters followed by a car zooming by", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["u5RmF3c3Aw", "zFjIWfSD-4"], "start_seconds": ["60", "410"], "properties": ["engine, car, zoom", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a car zooming by?", "label": 0}, {"captions": ["a clang followed by a toilet flushing", "pigeons vocalize and birds chirp"], "sample_ids": ["wNZ5thZM7XU", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["sound, flush, toilet", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a toilet in a bathroom stall", "of the pigeon in the cage"], "captions_pred_audio": ["a toilet flushes", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a clock ticktocks"], "sample_ids": ["vfYTJq7nU", "v-g-j2uTByM"], "start_seconds": ["130", "30"], "properties": ["ducks, quack, man", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a dog barks and whimpers", "wind blows and people scream while an engine revs"], "sample_ids": ["sShpyu2l4YQ", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["barks, whimpers, dog", "wind, engine, scream"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more quiet", "label": 0}, {"captions": ["metal clacking as food and oil sizzles followed by a woman 
talking", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vW4x7S1VfQc", "tiDFTC-5vU"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "male, duck, laugh"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", null], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "some tunes played by whistling"], "sample_ids": ["vXlk0lIQBFo", "u6BnG6YZqJ4"], "start_seconds": ["470", "0"], "properties": ["wind, speak, vocalize", "tune, play, whistling"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["tOSWIURC-4", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["engine, work, nearby", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a lawn mower is running ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "some men converse over an engine running"], "sample_ids": ["u--KhUW8l1Y", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["engine, sound, horn", "men, converse, engine"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", null], "captions_pred_audio": ["a fire truck siren blares and a horn blows 
", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a running engine", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a young woman speaks over spraying and another person yells"], "sample_ids": ["wy1eKjR7KC0", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["people, talk, distance", "person, spray, yell"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a stream of water runs briefly"], "sample_ids": ["tiDFTC-5vU", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["male, duck, laugh", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["weDbePuc-Xc", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["music, slaps, human", "airplane, boy, fly"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a woman sneezes then 
speaks", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["x4dZyf9Gbj0", "y8WEcpOlT3I"], "start_seconds": ["130", "40"], "properties": ["sneezes, speaks, woman", "harsh, wind, blows"], "captions_pred_video": ["footage is blurry and out of focus", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking with wind noise in the background "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "plastic is tapped on while someone speaks"], "sample_ids": ["y8dSeubCNI", "wvKpEYswXO0"], "start_seconds": ["4", "150"], "properties": ["men, women, car", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is quieter", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a baby laughs giddily and a woman laughs then speaks"], "sample_ids": ["y8dSeubCNI", "wjsXBsc7M40"], "start_seconds": ["4", "10"], "properties": ["men, women, car", "a baby laughs, a woman laughs, a woman speaks"], "captions_pred_video": [null, "footage of the baby playing with a toothbrush"], "captions_pred_audio": ["an engine revving and people talking in the background", "a baby laughs and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "people cheer as a vehicle engine revs"], "sample_ids": ["wDVMhEdTiVw", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["gun, shoot, water", "engine revs, vehicle, people"], "captions_pred_video": ["a blurry image of trees and water in the forest", "a school bus decorated with christmas lights is 
floating in the water"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a man speaks as crickets sing"], "sample_ids": ["tQWGZLItBXk", "ryFDPxgDOGc"], "start_seconds": ["170", "570"], "properties": ["music, kid, speak", "a, crickets, sing"], "captions_pred_video": ["worms revolution screenshots", "a group of people dressed in camouflage and hunting gear in the dark"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking with crickets chirping in the background"], "question": "which entity has a man speaking as crickets sing?", "label": 1}, {"captions": ["a car accelerates and wind blows", "a motor runs steadily as a man speaks, then the motor revs twice"], "sample_ids": ["u0TrcHhkPQ", "ylpYOorfH4o"], "start_seconds": ["20", "410"], "properties": ["accelerates, wind, blows", "motor, run, steady"], "captions_pred_video": [null, "for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and an engine is revving"], "question": "which is a moving object", "label": 0}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a bus 
engine idles while a woman speaks making an announcement"], "sample_ids": ["tZGN5a7ybxo", "su6FAOcOA8c"], "start_seconds": ["60", "4"], "properties": ["ring, train, horn", "engine, idle, woman"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wDVMhEdTiVw", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["gun, shoot, water", "engine, idle, woman"], "captions_pred_video": ["a blurry image of trees and water in the forest", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["yYJksgsxx5U", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["audio, woman, silverware", "A, game, keyboard"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the 
legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man speaks and types on a computer keyboard "], "question": "which entity is a video?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a flush is followed by gurgling water, then another flush"], "sample_ids": ["vfYTJq7nU", "tqR406bGiE"], "start_seconds": ["130", "40"], "properties": ["rustling, ducks, quack", "flush, water, gurgle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a duck quacks and a woman speaks", "a toilet is flushed"], "question": "which entity is more likely to be a video of a toilet flushing?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "someone is typing on a computer keyboard"], "sample_ids": ["zofjfKhqLk8", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["noise, stop, motor", "keyboard, type, computer"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "how to make money on youtube in spanish"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "water splashes and a door squeaks"], "sample_ids": ["sxYkFKFIZD0", "sdXV-ylviw"], "start_seconds": ["20", "190"], "properties": ["screech, man, door", "sound, splash, door"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 
2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a dog barks and taps with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["a person is whistling a tune", "a infant makes noise and is excited"], "sample_ids": ["scYRUkrFLiQ", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["a, tune, whistle", "noise, excited, infant"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a person whistling a song", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "winds blows roughly as a vehicle races past"], "sample_ids": ["w0xsN8X18Y", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["music, surface, rain", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be a storm", "label": 1}, {"captions": ["an insect buzzes around continuously", "someone is typing on a computer keyboard"], "sample_ids": ["v25l1jef3JY", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["buzzes, continuously, insect", "keyboard, type, computer"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "how to make money on youtube in spanish"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a person is typing on a keyboard"], "question": 
"which entity is not a person", "label": 0}, {"captions": ["a woman speaks happily and an animal chirps", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uWAAAL4CIoc", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["a woman, chirps, animal", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a machine runs continuously", "water pouring and bubbling"], "sample_ids": ["wdXV3Pv0jiY", "uyRfq-jKPpo"], "start_seconds": ["11", "50"], "properties": ["machine, running, continuously", "water, bubbles, pouring"], "captions_pred_video": ["footage is blurry and shaky", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "water is running from a faucet"], "question": "which entity is not running continuously", "label": 1}, {"captions": ["scraping and female speech with distant music", "a infant makes noise and is excited"], "sample_ids": ["yHeVV-xeOxQ", "wIJK3-5y0kA"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "noise, excited, infant"], 
"captions_pred_video": ["of a girl milking a goat's udder", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a crowd yells, reacts and applauds"], "sample_ids": ["zY3icUyMdh8", "wztCSUxOf8"], "start_seconds": ["20", "130"], "properties": ["dog, bark, engine", "a crowd, yells, applauds"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking and a crowd is clapping"], "question": "which entity is more likely to be at a sporting event", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wRV8yMk886E", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["liquid, spray, nozzle", "loud, jet engine, roar"], "captions_pred_video": ["two cars are parked in a parking lot at night", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man speaks followed by a loud burst", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "birds chirp quietly and an adult man speaks"], "sample_ids": ["y8dSeubCNI", "zuua6-5goWw"], "start_seconds": ["4", "30"], "properties": ["men, women, car", "birds, chirp, quiet, man, speaks"], "captions_pred_video": [null, "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 
10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["an engine revving and people talking in the background", "birds are chirping and a man is speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["tDVADusiIoc", "ziUT9IFTkjg"], "start_seconds": ["60", "10"], "properties": ["water, radio, man", "background, birds, rustling"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xl2PIWyXaM", "vJ7JPEFhyLA"], "start_seconds": ["160", "16"], "properties": ["chirp, man, younger person", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and people are talking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a man talks as several small engines run", "music plays, a person speaks, followed by whooshes and a ding"], "sample_ids": ["u9A6VZQCZpU", "tQWGZLItBXk"], "start_seconds": ["30", "170"], "properties": ["a, man, talk", "music, person, ding"], "captions_pred_video": [null, "worms revolution screenshots"], 
"captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a person speaking?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a man speaks as a car is passing by"], "sample_ids": ["sa6TLVbooCc", "sK4u5T8hW78"], "start_seconds": ["240", "30"], "properties": ["people, laugh, child", "a, car, pass"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tQWGZLItBXk", "wDVMhEdTiVw"], "start_seconds": ["170", "30"], "properties": ["music, person, ding", "gun, shoot, water"], "captions_pred_video": ["worms revolution screenshots", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a game", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "white noise and snoring with some rustling in the background"], "sample_ids": 
["x5cuQjOdM3E", "xzKKf9bKNUo"], "start_seconds": ["30", "10"], "properties": ["cat, talk, meow", "background, noise, snoring"], "captions_pred_video": ["a black background with an airplane flying in the sky", "shows a woman laying on a bed with her eyes closed and her mouth open"], "captions_pred_audio": ["a cat meows and a woman speaks", "a person snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a man speaks as a car is passing by"], "sample_ids": ["sK4u5T8hW78", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "a, car, pass"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a photograph", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a car accelerates and wind blows"], "sample_ids": ["vs65y4qmyBE", "u0TrcHhkPQ"], "start_seconds": ["340", "20"], "properties": ["wind, blows, strongly", "accelerates, wind, blows"], "captions_pred_video": ["a car is 
engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "an adult male speaks and dials a rotary phone"], "sample_ids": ["tDVADusiIoc", "tK4VlLsNxak"], "start_seconds": ["60", "120"], "properties": ["man, radio, blows", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and using a sewing machine"], "question": "which entity is a man?", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["uYT5gxnyMWM", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["person, spray, yell", "water, radio, man"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a person speaking over spraying?", "label": 0}, {"captions": ["females talk and laugh over gusting wind", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["un9VQlzgZM", "x6ijhqRY38s"], "start_seconds": ["5", "250"], "properties": ["females, talk, laugh", "something metal, glass, hit"], "captions_pred_video": [null, "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man 
is speaking and dishes are clanging "], "question": "which entity is about a man talking?", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "small dogs yip and bark sharply"], "sample_ids": ["tezvROoo4bs", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["audio, throttle, speaking", "bark, yip, sharply"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xyL9F5VrjkE", "w5W5Kqtc8E"], "start_seconds": ["20", "100"], "properties": ["engine, run, wind", "wind, blow, vehicle"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a motorboat is moving and people are shouting and cheering "], "question": "which vehicle is running an engine?", "label": 0}, {"captions": ["multiple ducks quack continuously", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wfHeoPDLMaM", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["multiple, quack, continuously", "airplane, boy, fly"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn 
with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["ducks are quacking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["wind blows strongly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w8uLijTqtlU", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["wind, blows, strongly", "airplane, boy, fly"], "captions_pred_video": ["footage is blurry and shaky", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["the wind is blowing strongly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "birds chirp and objects are moved around"], "sample_ids": ["vZAw4apG0Es", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["people, clock, converse", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a clock is ticking and people are talking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "water flows as men speak and yell"], "sample_ids": ["zsLxS-uLJTw", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["horn, blast, 
train", "water, flow, men"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a train passing?", "label": 0}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wyllXV6PjKo", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["a baby, a woman, a man", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a helicopter engine idles continuously", "wind blows as people chatter quietly"], "sample_ids": ["ugHJF0hfYkg", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["engine, idle, continuously", "wind, chatter, people"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry and out of focus"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vzxHnu-SFEw", "sSMl2vc3ek"], "start_seconds": ["80", "20"], "properties": ["two objects, woman, speak", "loud, multiple, distance"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a man speaks as a motor runs in the background"], "sample_ids": ["y2ZBGpgbhHM", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["dog, chirp, breathe", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds chirping and a dog panting", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "two men speak as a buffeting wind blows"], "sample_ids": ["vSeGhaZt-aI", "y8WEcpOlT3I"], "start_seconds": ["50", "40"], "properties": ["water, bubbles, speak", "wind, speak, buffeting"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be in a calm environment", "label": 0}, {"captions": ["people clap 
and speak in the distance", "a person is snoring while sleeping"], "sample_ids": ["wwyfGO2J4", "vJrjSeP17yE"], "start_seconds": ["90", "40"], "properties": ["clap, distance, speak", "a person is sleeping, snoring, person"], "captions_pred_video": [null, "a black background with a small plane flying in the sky"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a person snoring loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wudZTNBtVqc", "uYT5gxnyMWM"], "start_seconds": ["60", "50"], "properties": ["accelerates, engine, wind", "female, spraying, scream"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn honks and then loudly blares", "a infant makes noise and is excited"], "sample_ids": ["wnpJndXuxLc", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["horn, honk, loud", "noise, excited, infant"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["w34HjHr6gAY", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "multiple, people, yell"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 
the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wvKpEYswXO0", "y2bVZ7rz-5M"], "start_seconds": ["150", "280"], "properties": ["sound, water, running", "motor noise, horn, siren"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a car accelerates and wind blows"], "sample_ids": ["wqZ135Ssz0", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["man, woman, squawks", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "water splashes as an animal walks through"], "sample_ids": ["sHbXC6na9hg", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["a person, saw, wood", 
"animal, water, splashes"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["an engine is idling and vibrating", "water splashes and gurgles as people speak"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a man speaks as a car is passing by"], "sample_ids": ["sjlVMgdGSK0", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["car, revving, loudly", "a, car, pass"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which car is revving loudly", "label": 0}, {"captions": ["various birds chirp and squeal, and an animal grunts", "wind blowing followed by a zoom"], "sample_ids": ["tDlysoZiA1I", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["animal, grunts, chirps", "wind, blow, zoom"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is silent", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a man speaks as a car is passing by"], "sample_ids": 
["sEprKHm8Sj8", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["car, tires, slows", "a, car, pass"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which car is going faster", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["uOpoD0gGXcs", "zj2R0XoFr5k"], "start_seconds": ["120", "50"], "properties": ["chirps, woman, bird", "airplane, boy, fly"], "captions_pred_video": ["a herd of cows grazing in the field", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "wind blowing followed by a zoom"], "sample_ids": ["ylpYOorfH4o", "vr8ZXjEBhMQ"], "start_seconds": ["410", "150"], "properties": ["engine, running, wind", "wind, blow, zoom"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the 
fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and an engine is revving", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a stream runs then someone speaks"], "sample_ids": ["x5cuQjOdM3E", "wbHTKEJZyhc"], "start_seconds": ["30", "20"], "properties": ["cat, meows, young woman", "stream, run, someone"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage 
of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a cat meows and a woman speaks", "a waterfall is flowing and people are speaking "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["s4Uz1Ffgo04", "su6FAOcOA8c"], "start_seconds": ["100", "4"], "properties": ["roars, background, people speaking", "engine, idle, woman"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman is speaking and a subway train is moving "], "question": "which entity is quieter", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a man speaks as a car is passing by"], "sample_ids": ["yYEVLuqEytU", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "a, car, pass"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 
1}, {"captions": ["women speak and laugh as wind blows", "a clock ticktocks"], "sample_ids": ["un9VQlzgZM", "v-g-j2uTByM"], "start_seconds": ["5", "30"], "properties": ["wind, speak, laugh", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wP8ZKrlx3oA", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["rain, storm, thunder", "rustling, ducks, quack"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a duck quacks and a woman speaks"], "question": "which entity is more likely to be in a lake", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["uiItxDsDMFI", "tDlfY3nmx1A"], "start_seconds": ["30", "160"], "properties": ["wood, piece, saw", "applause, laugh, man"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a saw is being used with background noise ", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity is a man speaking to a crowd?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "an airplane engine runs"], "sample_ids": ["x9JovgqUcs", "yVPZ2MNWpms"], "start_seconds": ["500", "0"], "properties": ["An adult is 
speaking, typing, and using a computer keyboard", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a machine beeps continuously"], "sample_ids": ["yRx9txMcBl0", "y682ml90jGw"], "start_seconds": ["40", "11"], "properties": ["accelerates, tires, squeals", "beeps, machine, continuously"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains", "two women and a man talk while a kid cries"], "sample_ids": ["sfAvvZwdLCY", "wyllXV6PjKo"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "a kid, talk, cry"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a woman speaks and a baby cries"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "winds blows roughly as a vehicle races past"], "sample_ids": ["tDVADusiIoc", "xjvTpk2Zpr8"], "start_seconds": ["60", "70"], "properties": ["man, radio, blows", "wind, 
blows, vehicle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars and wind blows "], "question": "which entity shows a vehicle racing past?", "label": 1}, {"captions": ["some people speak", "vehicles pass by on a roadway"], "sample_ids": ["vbZ-0lGPneg", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "pass, vehicle, roadway"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["multiple ducks quack continuously", "pigeons vocalize and birds chirp"], "sample_ids": ["wfHeoPDLMaM", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["multiple, quack, continuously", "vocalize, bird, chirp"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a 
barn with a fire", "of the pigeon in the cage"], "captions_pred_audio": ["ducks are quacking", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sWZzXuWYY", "wz7N8YRy74I"], "start_seconds": ["420", "30"], "properties": ["male, clanks, thumps", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["xKB8O8LTs6s", "ziUT9IFTkjg"], "start_seconds": ["70", "10"], "properties": ["music, gunfire, explosion", "background, birds, rustling"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "birds are chirping and a chime is ringing "], "question": "which entity is more peaceful", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["zcDwZ6W7E3E", "wwyfGO2J4"], "start_seconds": ["180", "90"], "properties": ["man, speak, motorcycles", "people, applaud, hoot"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as he moves silverware in 
a bowl", "birds chirp and objects are moved around"], "sample_ids": ["x6ijhqRY38s", "yPUYU6t3rwo"], "start_seconds": ["250", "370"], "properties": ["bowl, silverware, man", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "insects buzz and a man speaks"], "question": "which entity is about moving objects around", "label": 1}, {"captions": ["a large crowd cheers and applauds", "an infant crying frantically"], "sample_ids": ["rqfQRErjfk8", "zwOBqeFTgiU"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "cry, infant, frantically"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "of the baby crying in the car seat"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a cat meows and children speak", "a child speaks in closed space"], "sample_ids": ["x5cuQjOdM3E", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["cat, speak, children", "child, space, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "people applaud and hoot and chat quietly"], "sample_ids": ["yRx9txMcBl0", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["accelerates, tires, squeals", "people, applaud, hoot"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand 
theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "people are clapping and speaking with background noise "], "question": "which entity is a performance", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["ugHJF0hfYkg", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["engine, running, continuously", "a woman, something, fried"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["a woman sneezes then speaks", "a frog croaks as other frogs croak in the background"], "sample_ids": ["x4dZyf9Gbj0", "yswmmRZFItk"], "start_seconds": ["130", "0"], "properties": ["sneezes, speaks, woman", "background, frog, croak"], "captions_pred_video": ["footage is blurry and out of focus", "a close up of a frog in the water"], "captions_pred_audio": ["a woman sneezes and speaks", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "some tunes played by whistling"], "sample_ids": ["sYITalLZjj4", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["water, rushes, background, birds", "tune, play, 
whistling"], "captions_pred_video": ["two ducks are swimming in the water near each other", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["wind blows and birds chirp", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["uiItxDsDMFI", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["wood, piece, saw", "engine, revs, vehicle"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a saw is being used with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tQWGZLItBXk", "vbZ-0lGPneg"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "a woman, a television program, a bird"], "captions_pred_video": ["worms revolution screenshots", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a woman sneezes then speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["x4dZyf9Gbj0", "vzxHnu-SFEw"], "start_seconds": ["130", "80"], "properties": ["sneezes, speaks, woman", "two objects, woman, speak"], "captions_pred_video": ["footage is blurry and out of focus", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman speaks as she rubs two objects together?", "label": 1}, {"captions": ["a helicopter engine runs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["t5ZbXbniOWk", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["engine, helicopter, run", "engine, idle, woman"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a subway train is moving "], "question": "which entity has a running engine", "label": 0}, {"captions": ["a person sniffles and sneezes", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uRlbY6aoBU", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["sneezes, sniffles, person", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a duck quacks and a woman speaks"], "question": 
"which entity is about animals?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vZAqdHZ81yA", "wDVMhEdTiVw"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "gun, shoot, water"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["an engine is idling loudly", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a cat meows and children speak", "an insect buzzes around continuously"], "sample_ids": ["x5cuQjOdM3E", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["cat, speak, children", "buzzes, continuously, insect"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a cat meows and a woman speaks", "a fly is buzzing around a microphone "], "question": "which entity is more active", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "people cheer as a vehicle engine revs"], "sample_ids": ["yYJksgsxx5U", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["audio, woman, silverware", "engine revs, vehicle, people"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w0xsN8X18Y", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], 
"properties": ["rain, thunder, surface", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a gun?", "label": 0}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a woman speaks and dog vocalizes"], "sample_ids": ["zj2R0XoFr5k", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["airplane, boy, fly", "a, dog, vocalize"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a dog is barking "], "question": "which entity is a dog?", "label": 1}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "wind blows as people chatter quietly"], "sample_ids": ["vr8ZXjEBhMQ", "xBxDz0CFVn0"], "start_seconds": ["150", "30"], "properties": ["sound, distance, engine", "wind, chatter, people"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage is blurry and out of focus"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a man speaks as a car is passing by"], "sample_ids": ["zY3icUyMdh8", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "a, car, pass"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more passive", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a child speaks in closed space"], "sample_ids": ["xSKJGCItUWE", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["engine, run, boy", "child, space, speak"], "captions_pred_video": ["footage of the helicopter flying in the room", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vlJS7LN2XyM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["background, clocks, ticking", "stream, water, flow"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage is blurry and out of focus"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a helicopter engine idles continuously"], "sample_ids": ["vs65y4qmyBE", "ugHJF0hfYkg"], "start_seconds": ["340", "10"], "properties": ["engine, run, man", "engine, idle, continuously"], "captions_pred_video": ["a car is engulfed in flames on the side 
of the road", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a helicopter is flying overhead "], "question": "which entity has a running engine", "label": 0}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a man speaks as a vehicle engine idles"], "sample_ids": ["yLy-WycbVVE", "shmR4OZtzqA"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "man, engine, idle"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a man speaks while a motor runs"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a stream of water runs briefly"], "sample_ids": ["vdoxuJn9lTc", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["person, burp, girl", "stream, water, run"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a child speaks followed by a burp", "a car is driving on a wet road "], "question": 
"which entity is a stream of water?", "label": 1}, {"captions": ["a beep occurs briefly", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["xtWeJ56-U-g", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["beep, occur, briefly", "a, scream, girl"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["birds chirp then an animal grunts", "people speak as gunfire rings out"], "sample_ids": ["tDlysoZiA1I", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["animal, grunt, chirp", "gunfire, ring, speak"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["zcDwZ6W7E3E", "uWAAAL4CIoc"], "start_seconds": ["180", "0"], "properties": ["man, speak, motorcycles", "a woman, chirps, animal"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a woman is speaking and a dog is barking "], "question": "which entity has a more active animal", "label": 1}, {"captions": ["young female child snoring and 
breathing deeply", "a train horn blows as it passes by"], "sample_ids": ["sAam2NqGhLY", "zVacuqSb4LI"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "horn, blows, train"], "captions_pred_video": ["of a little girl sleeping on a couch", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a person is snoring", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is not a person", "label": 1}, {"captions": ["food is frying then a woman speaks", "a stream of water runs briefly"], "sample_ids": ["ukxt9I7eMMg", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["food, woman, speak", "stream, water, run"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sQwlkXjQabo", "sLUnaPT5gM8"], "start_seconds": ["10", "0"], "properties": ["water, spray, surface", "loud, laughter, 
intermittent"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["spraying followed by silence", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "paper is crumpling consistently"], "sample_ids": ["wDVMhEdTiVw", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["gun, shoot, water", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a blurry image of trees and water in the forest", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a propeller moves loudly nearby", "motors runs briefly and tires screech"], "sample_ids": ["ugHJF0hfYkg", "yRx9txMcBl0"], "start_seconds": ["10", "40"], "properties": ["loud, propeller, move", "motors, tires, screech"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is revving its engine and skidding "], "question": "which is louder", "label": 1}, {"captions": ["birds chirp, a woman speaks, 
and insects buzz", "paper is crumpling consistently"], "sample_ids": ["t97k0cejSQE", "v5cSxLaHADY"], "start_seconds": ["250", "0"], "properties": ["sound, chirp, buzz", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "wind blows as people chatter quietly"], "sample_ids": ["tDVADusiIoc", "xBxDz0CFVn0"], "start_seconds": ["60", "30"], "properties": ["wind, radio, waves", "wind, chatter, people"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["an engine runs and wind blows", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vs65y4qmyBE", "vfYTJq7nU"], "start_seconds": ["340", "130"], "properties": ["engine, run, wind", "rustling, ducks, quack"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["someone whistles a song", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["sIXTftIuUgw", "vKrYfzleLB8"], "start_seconds": ["90", "110"], "properties": ["someone, song, whistle", "a, ring, gunshots"], "captions_pred_video": [null, "stock footage of a person holding a gun in their 
hand"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity is more quiet", "label": 0}, {"captions": ["sawing of wood and rustling with leaves blowing in the distance", "a toilet flushes and water drains"], "sample_ids": ["uiItxDsDMFI", "sfAvvZwdLCY"], "start_seconds": ["30", "20"], "properties": ["sound, distance, leaves", "water drains, flushes, water"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a saw is being used with background noise ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["goats bleat and metal clings", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tH17JPjDPnc", "uZesmtKZGSw"], "start_seconds": ["260", "250"], "properties": ["bleat, metal, clings", "men, talk, cars"], "captions_pred_video": ["feed of the goats eating hay in the barn", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water flows and trickles", "a woman talking as an infant is crying"], "sample_ids": ["tB7hWb9gTuQ", "tMbMDvT50j8"], "start_seconds": ["30", "12"], "properties": ["water, flow, trickle", "a, 
talk, infant"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["water is splashing and gurgling", "a baby cries and a woman speaks"], "question": "which entity is a human activity", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sapQIQUhFc", "uEU-Hg5MTN8"], "start_seconds": ["280", "27"], "properties": ["water, stream, trickles", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a person snores loudly multiple times at a close distance"], "sample_ids": ["smGI3C1NZc", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["water, drain, toilet", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a child speaks in closed space"], "sample_ids": ["weDbePuc-Xc", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["cartoon character, music, vocalize", "child, space, speak"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which 
entity is speaking", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a duck quacks loudly and continuously"], "sample_ids": ["vdoxuJn9lTc", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "loud, continuous, quacks"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a child speaks followed by a burp", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "an airplane engine roars increasingly louder"], "sample_ids": ["wjsXBsc7M40", "vBslzh7saPw"], "start_seconds": ["10", "90"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "engine, roar, louder"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a jet engine roars and accelerates "], "question": "which entity is louder", "label": 1}, {"captions": ["a child speaks in closed space", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yW6FWLSLkx4", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["child, space, speak", "three men, wind, flow"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be in a closed space", "label": 0}, {"captions": ["a man rubs two objects together then speaks", "birds chirp and objects are moved around"], "sample_ids": ["vveS8HT7Uog", "yPUYU6t3rwo"], "start_seconds": ["100", "370"], 
"properties": ["a man, objects, speak", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "insects buzz and a man speaks"], "question": "which entity is about birds?", "label": 1}, {"captions": ["paper folding and crinkling", "a infant makes noise and is excited"], "sample_ids": ["zPpG3RD8lSs", "wIJK3-5y0kA"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "noise, excited, infant"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a machine runs continuously", "a steam engine runs and whistles as it passes by"], "sample_ids": ["wdXV3Pv0jiY", "se87d6yxEOA"], "start_seconds": ["11", "10"], "properties": ["machine, running, continuously", "run, whistle, pass"], 
"captions_pred_video": ["footage is blurry and shaky", "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a train is moving and blowing its whistle "], "question": "which machine is running continuously", "label": 0}, {"captions": ["an animal growls followed by birds chirping", "small dogs yip and bark sharply"], "sample_ids": ["y2ZBGpgbhHM", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["animal, growl, bird", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds chirping and a dog panting", "a dog barks and growls"], "question": "which animal is more likely to be a dog", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sQGXqGcwOTc", "zl9Dqx-j7q4"], "start_seconds": ["3", "6"], "properties": ["audio, kid, giggles", "engine, laugh, loud"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a man driving a car in the dark"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a duck quacks continuously"], "sample_ids": ["w1mlz3Pe4fU", "vh30P49Po6s"], "start_seconds": ["300", "30"], "properties": ["vocalize, chirp, continuously", "quacks, continuously, duck"], "captions_pred_video": ["of a bird in a cage", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and singing", "a duck is quacking loudly"], "question": "which entity is a duck?", "label": 1}, {"captions": ["an engine runs and a man speaks", "a horn rings out as 
a machine runs by"], "sample_ids": ["yT5WfYMRr-U", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["engine, run, man", "a, horn, run"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a person snoring", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["t8tv5YRMJUg", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["a person, snore, loud", "animal, grunts, snorts"], "captions_pred_video": ["of a man getting his face licked by another man", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "water is sprayed across a hard surface"], "sample_ids": ["wudZTNBtVqc", "sQwlkXjQabo"], "start_seconds": ["60", "10"], "properties": ["accelerates, engine, wind", "water, spray, surface"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a car accelerates and revs its engine ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "water splashes as an animal walks through"], "sample_ids": ["zofjfKhqLk8", "w1ir-sZ3Im8"], "start_seconds": ["10", "90"], "properties": ["noise, stop, motor", "animal, water, splashes"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a 
large engine is running and a bell is ringing", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a man talks while vehicles pass by", "a clock ticktocks"], "sample_ids": ["sK4u5T8hW78", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "ticktocks, clock, ticktocks"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a woman talking as an infant is crying", "an airplane engine spools and people speak"], "sample_ids": ["tMbMDvT50j8", "wTjoRj1se3U"], "start_seconds": ["12", "390"], "properties": ["a, talk, infant", "airplane, engine, spool"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a baby cries and a woman speaks", "a jet engine is running and people are talking"], "question": "which entity is a video of a person talking to an infant?", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["x4a9YGIw4ok", "zj2R0XoFr5k"], "start_seconds": ["120", "50"], "properties": ["water, gurgles, stops", "airplane, boy, fly"], "captions_pred_video": ["footage is 
blurry and out of focus", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a toilet flushes and water splashes", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a propeller rotates loudly and intensely"], "sample_ids": ["y2ZBGpgbhHM", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["animal, growl, bird", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["birds chirping and a dog panting", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "vehicles pass by on a roadway"], "sample_ids": ["vcmWSmvti8", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["music, man, fire", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["an airplane accelerates briefly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zjTG0gaGCUI", "xKB8O8LTs6s"], "start_seconds": ["80", "70"], "properties": ["accelerates, airplane, briefly", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a jet engine roars as wind blows ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a movie?", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": 
["zVacuqSb4LI", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["blares, fades, train", "rooster, crow, background, men"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xKB8O8LTs6s", "zl9Dqx-j7q4"], "start_seconds": ["70", "6"], "properties": ["music, gunshots, explosion", "engine, laugh, loud"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a man driving a car in the dark"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a vehicle engine runs and someone speaks"], "sample_ids": ["zOZleIRqZm4", "zF8yoL0rkbI"], "start_seconds": ["80", "30"], "properties": ["rustling, leaves, person", "engine, run, 
someone"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the traffic on the street at night"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "the wind is blowing hard and water is splashing"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zj2R0XoFr5k", "tdWhHV3X25Q"], "start_seconds": ["50", "60"], "properties": ["airplane, boy, fly", "applause, audience, yells"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "water is sprayed across a hard surface"], "sample_ids": ["uYT5gxnyMWM", "sQwlkXjQabo"], "start_seconds": ["50", "10"], "properties": ["person, spray, yell", "water, spray, surface"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "spraying followed by silence"], "question": "which entity is a video of a person speaking over spraying and another person yelling?", "label": 0}, {"captions": ["an engine revs and a turning noise is made", "someone is burping continuously"], "sample_ids": ["tOSWIURC-4", "y636gklDioE"], "start_seconds": ["0", "20"], "properties": ["noise, engine, revs", "burps, burps, burps"], "captions_pred_video": [null, "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a lawn mower is running ", 
"a person burps loudly several times"], "question": "which noise is made by a human", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a clock ticks quietly and rhythmically"], "sample_ids": ["tK4VlLsNxak", "u7C-AEBQM"], "start_seconds": ["120", "30"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "ticks, rhythmic, quiet"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["tEE3MpBt1sg", "sjlVMgdGSK0"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "accelerates, vehicle, race car"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["vMf1dLD6Sng", "y2bVZ7rz-5M"], "start_seconds": ["6", "280"], "properties": ["frog, bird, vocalize", "motor noise, horn, siren"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a frog croaks loudly", "a truck is honking its horn and a siren is blaring "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a train whistle keeps going off while the clickety-clack of the train on the 
rails are continuous"], "sample_ids": ["tw76HGONaKg", "ukg5L09Wpvo"], "start_seconds": ["570", "150"], "properties": ["music, click, man", "clickety-clack, train, whistle"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yDoT73BWsdA", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["engine, revs, vehicle", "People, motor, brakes"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running and air brakes hissing?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a clock ticktocks"], "sample_ids": ["vJrjSeP17yE", "v-g-j2uTByM"], "start_seconds": ["40", "30"], 
"properties": ["a person is sleeping, snoring, person", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a black background with a small plane flying in the sky", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person snoring loudly", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "waves crash against a shoreline and people speak"], "sample_ids": ["zsLxS-uLJTw", "yFB25fqfU8I"], "start_seconds": ["20", "300"], "properties": ["horn, blast, train", "wave, crash, shoreline"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a duck quacks several times", "someone whistles a tune"], "sample_ids": ["vh30P49Po6s", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["quacks, duck, several", "someone, tune, whistle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["a helicopter engine runs", "a duck quacks continuously"], "sample_ids": ["t5ZbXbniOWk", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["engine, helicopter, run", "quacks, continuously, duck"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a helicopter is flying overhead ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks as a car is passing by", 
"water rushes and then a vehicle zooms past"], "sample_ids": ["sK4u5T8hW78", "s4Uz1Ffgo04"], "start_seconds": ["30", "100"], "properties": ["a, car, pass", "water, rushes, vehicle"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is about a vehicle zooming past?", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "an infant crying as a woman laughs"], "sample_ids": ["wDVMhEdTiVw", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["gun, shoot, water", "a, laugh, infant"], "captions_pred_video": ["a blurry image of trees and water in the forest", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "people speak as gunfire rings out"], "sample_ids": ["vcmWSmvti8", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["music, man, fire", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking and a gun is fired"], "question": "which entity is about a man speaking as music plays before artillery is 
fired?", "label": 0}, {"captions": ["an adult woman and an adult man speak", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zTLVJCo4WEE", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["two people, adult, speak", "rustling, ducks, quack"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a duck quacks and a woman speaks"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a man speaks followed by another man speaking outside"], "sample_ids": ["wfHeoPDLMaM", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "two men, speak, follow"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["ducks are quacking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a group of animals?", "label": 0}, {"captions": ["a door opens and closes", 
"people applaud and hoot and chat quietly"], "sample_ids": ["vBHyYJ8pL0", "wwyfGO2J4"], "start_seconds": ["2", "90"], "properties": ["open, close, door", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a child speaks in closed space"], "sample_ids": ["u--KhUW8l1Y", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["horn, siren, life", "child, space, speak"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "water flows as men speak and yell"], "sample_ids": ["vddP56-ogds", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["water, flow, laugh", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["three men talk while wind blows and some liquid flows", "someone whistles a tune"], "sample_ids": ["vJ7JPEFhyLA", "sIXTftIuUgw"], "start_seconds": ["16", "90"], "properties": ["three men, wind, flow", "someone, tune, whistle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person whistling a song"], 
"question": "which entity is a musical performance", "label": 1}, {"captions": ["a motorcycle engine is idling", "paper is crumpling consistently"], "sample_ids": ["vZAqdHZ81yA", "v5cSxLaHADY"], "start_seconds": ["180", "0"], "properties": ["engine, motorcycle, idling", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["an engine is idling loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a woman talking as an infant is crying", "three men talk while wind blows and some liquid flows"], "sample_ids": ["tMbMDvT50j8", "vJ7JPEFhyLA"], "start_seconds": ["12", "16"], "properties": ["a, talk, infant", "three men, wind, flow"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman talking to an infant?", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yRx9txMcBl0", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["accelerates, tires, squeals", "men, talk, cars"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 
mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a person whistles and clicks a mouse", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zCrAfDfv6-A", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["person, mouse, click", "engine, idle, woman"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "pigeons vocalize and birds chirp"], "sample_ids": ["uiItxDsDMFI", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["wood, piece, saw", "vocalize, bird, chirp"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of the pigeon in the cage"], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a telephone rings 
and a bird vocalizes", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["skd2PphS6oI", "yajyRTUQk3U"], "start_seconds": ["190", "400"], "properties": ["ring, bird, vocalize", "a woman, something, fried"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["some men converse over an engine running", "vehicles pass by on a roadway"], "sample_ids": ["sCiy7QS1U", "tgbONvsP47Y"], "start_seconds": ["300", "0"], "properties": ["men, converse, engine", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a car accelerates and wind blows", "a person snores loudly multiple times at a close distance"], "sample_ids": ["u0TrcHhkPQ", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["accelerates, wind, blows", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["s6DESzUTGjY", "y8WEcpOlT3I"], "start_seconds": ["16", "40"], "properties": ["wind, laugh, woman", "harsh, wind, blows"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an 
aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is softer", "label": 0}, {"captions": ["dishes cling together then a man begins to speak", "pigeons vocalize and birds chirp"], "sample_ids": ["sQGXqGcwOTc", "uiS58TNyUiw"], "start_seconds": ["3", "430"], "properties": ["cling, speak, dishes", "vocalize, bird, chirp"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "of the pigeon in the cage"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a weapon fires multiple times"], "sample_ids": ["t25U-v4k4ts", "sMC07Ucy7kg"], "start_seconds": ["40", "10"], "properties": ["a, chirps, bird", "weapon, fire, multiple"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage is from a car's point of view"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "water pouring and bubbling"], 
"sample_ids": ["wnpJndXuxLc", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["beeps, loud, whistle", "water, bubbles, pouring"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a train horn blows as it passes by"], "sample_ids": ["u5RmF3c3Aw", "zVacuqSb4LI"], "start_seconds": ["60", "30"], "properties": ["engine, car, zoom", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video 
by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["tQWGZLItBXk", "w34HjHr6gAY"], "start_seconds": ["170", "30"], "properties": ["music, person, ding", "beeps, hit, woman"], "captions_pred_video": ["worms revolution screenshots", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a beep sounds followed by a child speaking"], "question": "which entity has a person speaking?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "a power tool runs and touches a surface"], "sample_ids": ["ugHJF0hfYkg", "zfvPRf3chY"], "start_seconds": ["10", "290"], "properties": ["engine, running, continuously", "power tool, run, touch"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while a power tool is being used "], "question": "which entity is not running continuously", "label": 1}, {"captions": ["a female 
speaks softly as paper crinkles", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xvDdE3zNf8Y", "uEU-Hg5MTN8"], "start_seconds": ["120", "27"], "properties": ["a, female, speaks", "a woman, laughs, animal"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking softly?", "label": 0}, {"captions": ["multiple people speak and children yell while water gurgles", "water flows and trickles"], "sample_ids": ["vb1fPSDI4c", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a crowd of people are talking and laughing", "water is splashing and gurgling"], "question": "which entity is a video of water flowing and trickling?", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "birds chirp and objects are moved around"], "sample_ids": ["w2bYrCVLT60", "yPUYU6t3rwo"], "start_seconds": ["120", "370"], "properties": ["ducks, speak, quack", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a horse runs while two women talk"], "sample_ids": ["vuUVPzd2FXw", "sdvI1mHAsc"], "start_seconds": ["160", "20"], "properties": ["a, steam, release", "two women, horse, run"], "captions_pred_video": ["of the person 
cooking on the grill with a spatula", null], "captions_pred_audio": ["a man is speaking and dishes are clanging", "horses clip-clop and a woman speaks"], "question": "which entity is a horse?", "label": 1}, {"captions": ["a car accelerates and wind blows", "a toilet door squeaks as it is opened"], "sample_ids": ["u0TrcHhkPQ", "sdXV-ylviw"], "start_seconds": ["20", "190"], "properties": ["accelerates, wind, blows", "door, toilet, squeaks"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dog barks and taps with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["women speak and laugh as wind blows", "small dogs yip and bark sharply"], "sample_ids": ["un9VQlzgZM", "v-wcQf4BDY0"], "start_seconds": ["5", "120"], "properties": ["wind, speak, laugh", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a person is snoring while sleeping", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vJrjSeP17yE", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["a person is sleeping, snoring, person", "rustling, ducks, quack"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a duck quacks and a woman speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a telephone rings followed by a woman talking"], "sample_ids": ["tK4VlLsNxak", "tGcFnX0GHI"], "start_seconds": ["120", "0"], "properties": ["An adult male speaks, dials, and speaks into 
a rotary phone", "ring, talk, woman"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a ringing phone", "label": 1}, {"captions": ["scraping and female speech with distant music", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["yHeVV-xeOxQ", "yeFvk9x0wWI"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "clack, bird, chirp"], "captions_pred_video": ["of a girl milking a goat's udder", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "birds chirp in the background as a car drives by "], "question": "which entity is accompanied by birds", "label": 1}, {"captions": ["a man speaks as a machine runs", "an infant crying as a woman laughs"], "sample_ids": ["vD6lYD1l0BY", "xhmRY9yhC7c"], "start_seconds": ["330", "20"], "properties": ["a, machine, run", "a, laugh, infant"], "captions_pred_video": ["game controller being held in the hands of the person", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["s3cTDAj31g", "ziUT9IFTkjg"], "start_seconds": ["80", "10"], "properties": ["man, talk, woman", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak and tapping occurs", "a telephone rings followed by a woman 
talking"], "sample_ids": ["tFCUUGdREgA", "tGcFnX0GHI"], "start_seconds": ["70", "0"], "properties": ["people, tap, speak", "ring, talk, woman"], "captions_pred_video": ["a person riding a white horse in an indoor arena", null], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a man talks as several small engines run", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["u9A6VZQCZpU", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a telephone rings followed by a woman talking"], "sample_ids": ["vZAw4apG0Es", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["background, clock, ticktocks", "ring, talk, woman"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a clock ticking in the background", "label": 0}, {"captions": ["water flows as a woman laughs and a man speaks", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vddP56-ogds", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["water, flow, laugh", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["water is running and gurgling and a man is 
speaking", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a stream of water runs briefly"], "sample_ids": ["sShpyu2l4YQ", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["growl, bark, yip", "stream, water, run"], "captions_pred_video": ["the puppies are playing with a toy", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a dog is barking and growling", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["un9VQlzgZM", "su6FAOcOA8c"], "start_seconds": ["5", "4"], "properties": ["females, talk, laugh", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is about a bus engine?", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a stream of water runs briefly"], "sample_ids": ["vhJWZheqaE", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["water drains unevenly, toilet flushes, water drains", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a toilet is flushed", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a woman speaks happily and an animal chirps"], "sample_ids": ["xvDdE3zNf8Y", "uWAAAL4CIoc"], "start_seconds": ["120", "0"], "properties": ["a, female, speaks", "a woman, chirps, animal"], "captions_pred_video": ["of a woman in a white shirt and 
glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman is speaking and a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["electronic beeps occur in a short series", "vehicle engines race around a track as a man commentates"], "sample_ids": ["y682ml90jGw", "sZPuqDgX2V0"], "start_seconds": ["11", "30"], "properties": ["beeps, series, electronic", "commentator, race, track"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking and a helicopter is flying overhead "], "question": "which entity is a video of a race", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "someone is typing on a computer keyboard"], "sample_ids": ["vlS6YMeWAPo", "v0x1odnXtP0"], "start_seconds": ["40", "210"], "properties": ["noise, bleat, call", "keyboard, type, computer"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "how to make money on youtube in spanish"], "captions_pred_audio": ["a goat bleats and birds chirp", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "an infant crying frantically"], "sample_ids": ["sHbXC6na9hg", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "cry, infant, frantically"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "of the baby crying in the car seat"], "captions_pred_audio": ["an engine is idling and vibrating", "a baby cries loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["winds blows roughly as a vehicle races past", "wind blows as people chatter quietly"], "sample_ids": ["xjvTpk2Zpr8", "xBxDz0CFVn0"], "start_seconds": ["70", "30"], "properties": ["wind, blows, vehicle", "wind, chatter, people"], "captions_pred_video": ["footage of a dhl 
plane landing on the runway", "footage is blurry and out of focus"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a helicopter engine idles continuously", "some men converse over an engine running"], "sample_ids": ["ugHJF0hfYkg", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["engine, idle, continuously", "men, converse, engine"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a horn rings out as a machine runs by"], "sample_ids": ["vBslzh7saPw", "slZLHwNbbt4"], "start_seconds": ["90", "300"], "properties": ["engine, spools, takes", "a, horn, run"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["shmR4OZtzqA", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["man, engine, idle", "wind, blow, vehicle"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet 
avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", null], "captions_pred_audio": ["a man speaks while a motor runs", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["people speak softly as food sizzles", "people speak as gunfire rings out"], "sample_ids": ["yhQ2Lg-7qDY", "wqTCwqVRDlk"], "start_seconds": ["130", "80"], "properties": ["food, sizzle, speak", "gunfire, ring, speak"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "an airplane engine runs"], "sample_ids": ["uPDn2BFTHk", "yVPZ2MNWpms"], "start_seconds": ["140", "0"], "properties": ["woman, laughs, speaks", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tQWGZLItBXk", "uYT5gxnyMWM"], "start_seconds": ["170", "50"], "properties": ["music, kid, speak", "female, spraying, scream"], "captions_pred_video": ["worms revolution screenshots", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects 
and sound effects play ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and then playing music?", "label": 0}, {"captions": ["material crumbles into a microphone", "an infant crying as a woman laughs"], "sample_ids": ["vofpvUo6NAw", "xhmRY9yhC7c"], "start_seconds": ["220", "20"], "properties": ["material, crumbles, microphone", "a, laugh, infant"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["y2bVZ7rz-5M", "uWPRNLnpy7Y"], "start_seconds": ["280", "10"], "properties": ["engine, horn, siren", "accelerate, laugh, vehicle"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "is taken from a car driving down the street"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a car accelerates and revs its engine "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "an engine runs loudly"], "sample_ids": ["xyx6eNVEYRY", "vqZuVbG6-HI"], "start_seconds": ["380", "130"], "properties": ["loud, engine, muffles", "loud, engine, run"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage is blurry because it's raining outside"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["some men converse over an engine running", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sCiy7QS1U", "vb1fPSDI4c"], "start_seconds": ["300", 
"30"], "properties": ["men, converse, engine", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "several insects fly while two men talk"], "sample_ids": ["tqR406bGiE", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["flush, water, gurgle", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a toilet?", "label": 0}, {"captions": ["a man talks while vehicles pass by", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["sK4u5T8hW78", "uOpoD0gGXcs"], "start_seconds": ["30", "120"], "properties": ["a, man, talk", "chirps, woman, bird"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a herd of cows grazing in the field"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "birds are chirping and a man is speaking"], "question": "which entity is a response to a woman?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a man speaks as a car is passing by"], "sample_ids": ["smDKStoHBJo", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["a, talk, baby, cry", "a, car, pass"], 
"captions_pred_video": ["a man holding a crying baby in his arms", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a baby?", "label": 0}, {"captions": ["a man speaks while water drains", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vSeGhaZt-aI", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["water, drain, man", "engine, idle, woman"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "dishes cling together then a man begins to speak"], "sample_ids": ["vJvryTwuAV8", "sQGXqGcwOTc"], "start_seconds": ["16", "3"], "properties": ["audience, cheer, man", "cling, speak, dishes"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking to an audience?", "label": 0}, {"captions": ["a speedboat passes quickly on the water", "some men talk among st 
themselves as cars speed and race loudly"], "sample_ids": ["tjmoSi330GM", "uZesmtKZGSw"], "start_seconds": ["23", "250"], "properties": ["speed, water, boat", "men, talk, cars"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is moving faster", "label": 1}, {"captions": ["children speak and play together", "people applaud and hoot and chat quietly"], "sample_ids": ["yVVP8XvWJTo", "wwyfGO2J4"], "start_seconds": ["260", "90"], "properties": ["children, speak, play", "people, applaud, hoot"], "captions_pred_video": ["footage of a playground at a school or daycare center", null], "captions_pred_audio": ["children are speaking and breathing with background noise ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a clock ticktocks"], "sample_ids": ["w2JXXIAdUdg", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": 
["a person snoring and a dog whimpering", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a man speaks as a motor runs in the background"], "sample_ids": ["wz7N8YRy74I", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, people", "background, motor, run"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "an infant crying as a woman laughs"], "sample_ids": ["vb1fPSDI4c", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["multiple, people, yell", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water drains", "wind blowing followed by a zoom"], "sample_ids": ["sfAvvZwdLCY", "vr8ZXjEBhMQ"], "start_seconds": ["20", "150"], "properties": ["water drains, flushes, water", "wind, blow, zoom"], "captions_pred_video": ["footage of the toilet in the bathroom", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a toilet is flushed", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "vehicles pass by on a roadway"], "sample_ids": ["zkKdxzNC97Y", "tgbONvsP47Y"], "start_seconds": ["27", "0"], "properties": ["hard, surface, 
door", "pass, vehicle, roadway"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a door is opened and closed", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "pigeons vocalize and birds chirp"], "sample_ids": ["wwyfGO2J4", "uiS58TNyUiw"], "start_seconds": ["90", "430"], "properties": ["people, applaud, hoot", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a telephone rings followed by a woman talking"], "sample_ids": ["zofjfKhqLk8", "tGcFnX0GHI"], "start_seconds": ["10", "0"], "properties": ["background, metal, clings", "ring, talk, woman"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "water splashes as an animal walks through"], "sample_ids": ["tjmoSi330GM", "w1ir-sZ3Im8"], "start_seconds": ["23", "90"], "properties": ["speed, water, boat", "animal, water, splashes"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "water splashes and gurgles as people speak"], "question": "which entity is moving at a slower speed", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp 
and the music plays", "someone whistles a tune"], "sample_ids": ["sU53zg9Jp7s", "sIXTftIuUgw"], "start_seconds": ["380", "90"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "someone, tune, whistle"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a horse runs while two women talk", "a horn rings out as a machine runs by"], "sample_ids": ["sdvI1mHAsc", "slZLHwNbbt4"], "start_seconds": ["20", "300"], "properties": ["two women, horse, run", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "winds blows roughly as a vehicle races past"], "sample_ids": ["w-4gHptFNuU", "xjvTpk2Zpr8"], "start_seconds": ["21", "70"], "properties": ["engine revs, accelerates, bump", "wind, blows, vehicle"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["some people speak", "a drill runs and two people laugh"], "sample_ids": ["vbZ-0lGPneg", "tEE3MpBt1sg"], "start_seconds": ["30", "50"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "two people, laugh, drill"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage is blurry due to the smoke in the air"], "captions_pred_audio": 
["a woman is speaking and a dog is whimpering", "people are laughing breathing and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "paper is crumpling consistently"], "sample_ids": ["rwtmaKiCcQU", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["nozzle, depressed, spray can", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["spraying and people speaking", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a baby laughs giddily and a woman laughs then speaks"], "sample_ids": ["vmrxwuAMb2I", "wjsXBsc7M40"], "start_seconds": ["40", "10"], "properties": ["a dog, inhales, exhales", "a baby laughs, a woman laughs, a woman speaks"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "footage of the baby playing with a toothbrush"], "captions_pred_audio": ["a dog barks and growls", "a baby laughs and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "water flows and trickles"], "sample_ids": ["sZvwOuuPGP0", "tB7hWb9gTuQ"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "water, flow, trickle"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a medium engine is running ", "water is splashing and gurgling"], "question": "which entity is not a continuous flow", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a train whistle keeps going 
off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zkKdxzNC97Y", "ukg5L09Wpvo"], "start_seconds": ["27", "150"], "properties": ["hard, surface, door", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a door is opened and closed", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a man speaks as a motor runs in the background"], "sample_ids": ["zY3icUyMdh8", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "background, motor, run"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a baby cries and a woman moans", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["smDKStoHBJo", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["a, cry, woman", "applause, audience, yells"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["vbpKkWvfOu4", "xjvTpk2Zpr8"], "start_seconds": ["560", "70"], "properties": ["a, man, speaks", "wind, blows, vehicle"], 
"captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a jet engine roars and wind blows "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["s59PfAghdkM", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["bird, chirp, background, horse, neigh", "animal, grunts, chirps"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "birds are chirping and a rooster is crowing "], "question": "which entity has a horse neighing?", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["ylpYOorfH4o", "wDVMhEdTiVw"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "gun, shoot, water"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to 
replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["an engine starts and increases in power", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["zjTG0gaGCUI", "tezvROoo4bs"], "start_seconds": ["80", "40"], "properties": ["power, increase, engine", "audio, throttle, speaking"], "captions_pred_video": [null, "footage of a busy city street with cars parked on both sides of the road"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a car accelerates and revs while a man speaks "], "question": "which entity is about an engine?", "label": 0}, {"captions": ["water rushes and then a vehicle zooms past", "a person uses a saw to cut some wood"], "sample_ids": ["s4Uz1Ffgo04", "sHbXC6na9hg"], "start_seconds": ["100", "0"], "properties": ["water, rushes, vehicle", "a person, saw, wood"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "an engine is idling and vibrating"], "question": "which entity is a person", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "small dogs yip and bark sharply"], "sample_ids": ["sQGXqGcwOTc", "v-wcQf4BDY0"], "start_seconds": ["3", "120"], "properties": ["audio, kid, giggles", "bark, yip, sharply"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["mechanisms are operating and water is 
splashing ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "vehicles pass by on a roadway"], "sample_ids": ["vzxHnu-SFEw", "tgbONvsP47Y"], "start_seconds": ["80", "0"], "properties": ["two objects, woman, speak", "pass, vehicle, roadway"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a car is driving on the road "], "question": "which object is moving", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "waves crash against a shoreline and people speak"], "sample_ids": ["zofjfKhqLk8", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["background, metal, clings", "wave, crash, shoreline"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and laughing while water 
is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "vehicles pass by on a roadway"], "sample_ids": ["x6ijhqRY38s", "tgbONvsP47Y"], "start_seconds": ["250", "0"], "properties": ["something metal, glass, hit", "pass, vehicle, roadway"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "water pouring and bubbling"], "sample_ids": ["w2JXXIAdUdg", "uyRfq-jKPpo"], "start_seconds": ["10", "50"], "properties": ["emits, sleeping, person", "water, bubbles, pouring"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a person snoring and a dog whimpering", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "winds blows roughly as a vehicle 
races past"], "sample_ids": ["zgUgkpk78xU", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], "properties": ["clinking, humming, horn", "wind, blows, vehicle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "children cheer as a man speaks then an audience screams"], "sample_ids": ["shmR4OZtzqA", "vJvryTwuAV8"], "start_seconds": ["30", "16"], "properties": ["man, engine, idle", "audience, cheer, man"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a man speaks while a motor runs", "a man is speaking and a crowd is shouting and whooping "], "question": "which man is speaking to an audience?", "label": 1}, {"captions": 
["a motor runs steadily as a man speaks, then the motor revs twice", "winds blows roughly as a vehicle races past"], "sample_ids": ["ylpYOorfH4o", "xjvTpk2Zpr8"], "start_seconds": ["410", "70"], "properties": ["motor, run, steady", "wind, blows, vehicle"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["roadway noise occurs and a truck accelerates", "water is sprayed across a hard surface"], "sample_ids": ["tgbONvsP47Y", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["noise, truck, accelerate", "water, spray, surface"], "captions_pred_video": ["footage of a fire truck entering a garage", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a car is driving on the road ", "spraying followed by silence"], "question": "which is a liquid", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wqZ135Ssz0", "zFjIWfSD-4"], "start_seconds": ["60", "410"], "properties": ["two men, woman, birds", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and 
ducks are quacking with wind noise in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as a machine runs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vD6lYD1l0BY", "xKB8O8LTs6s"], "start_seconds": ["330", "70"], "properties": ["a, machine, run", "music, gunfire, explosion"], "captions_pred_video": ["game controller being held in the hands of the person", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["birds tweet and squawk", "a consistent ticking pattern"], "sample_ids": ["w1mlz3Pe4fU", "sCeWURVHfOM"], "start_seconds": ["300", "30"], "properties": ["squawk, tweet, scream", "ticking, pattern, clock"], "captions_pred_video": ["of a bird in a cage", "- a close-up view of the clock's inner workings"], "captions_pred_audio": ["birds are chirping and singing", "ticking of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person is snoring while sleeping", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vJrjSeP17yE", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "multiple, people, yell"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a toilet flushes and water drains", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sfAvvZwdLCY", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["water drains, flushes, water", "a 
woman, laughs, animal"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["children speak and play together", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yVVP8XvWJTo", "uEU-Hg5MTN8"], "start_seconds": ["260", "27"], "properties": ["children, speak, play", "a woman, laughs, animal"], "captions_pred_video": ["footage of a playground at a school or daycare center", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video of a woman speaking and laughing?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vzxHnu-SFEw", "uEU-Hg5MTN8"], "start_seconds": ["80", "27"], "properties": ["two objects, woman, speak", "animal, grunts, snorts"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "water flows and trickles"], "sample_ids": ["s3cTDAj31g", "tB7hWb9gTuQ"], "start_seconds": ["80", "30"], "properties": ["man, talk, woman", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and a baby is crying", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "people cheer as a vehicle engine revs"], "sample_ids": ["uJV8NDaHqqk", "xjhAnI2q6hM"], "start_seconds": ["100", "6"], "properties": ["loud, fly, chirp", "engine revs, vehicle, people"], "captions_pred_video": ["a bee hive in a wooden box", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a swarm of bees buzzing around", "a truck is revving its engine and a man is speaking "], "question": "which entity is not a person?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["w5W5Kqtc8E", "vlS6YMeWAPo"], "start_seconds": ["100", "40"], "properties": ["wind, engine, scream", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle engine is revving while people are 
speaking", "a woman speaks and other women and a man talk with her"], "sample_ids": ["y8dSeubCNI", "vbpKkWvfOu4"], "start_seconds": ["4", "560"], "properties": ["engine revving, people speaking, motorcycle", "a, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "wind blows as people chatter quietly"], "sample_ids": ["uC9dtII1KDI", "xBxDz0CFVn0"], "start_seconds": ["150", "30"], "properties": ["wind, gusts, distance", "wind, chatter, people"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xjvTpk2Zpr8", "zl9Dqx-j7q4"], "start_seconds": ["70", "6"], "properties": ["wind, blows, vehicle", "engine, laugh, loud"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a car speeding up in the distance"], "sample_ids": ["uPDn2BFTHk", "u0TrcHhkPQ"], "start_seconds": ["140", "20"], "properties": ["lady, 
laugh, baby", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["yaln9y8I7ms", "ugHJF0hfYkg"], "start_seconds": ["230", "10"], "properties": ["female, flushes, toilet", "loud, intense, propeller"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a stream of water flows as people talk and wind blows"], "sample_ids": ["tDlysoZiA1I", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, chirp", "stream, water, flow"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ylpYOorfH4o", "w5W5Kqtc8E"], "start_seconds": ["410", "100"], "properties": ["motor, run, steady", "wind, blow, vehicle"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 
youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a stream of water runs briefly"], "sample_ids": ["ugHJF0hfYkg", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["engine, running, continuously", "stream, water, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving on a wet road "], "question": "which entity is running continuously", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "someone is typing on a computer keyboard"], "sample_ids": ["yDoT73BWsdA", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["engine revs, tires squeal, vehicle", "keyboard, type, computer"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "how to make money on youtube in spanish"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which entity is stationary", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "someone is typing on a computer keyboard"], "sample_ids": ["yswmmRZFItk", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["background, frog, croak", "keyboard, type, computer"], "captions_pred_video": ["a close up of a frog in the water", "how to make money on youtube in spanish"], "captions_pred_audio": ["a frog is croaking", "a person is typing on a keyboard"], 
"question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "water pouring and bubbling"], "sample_ids": ["wvKpEYswXO0", "uyRfq-jKPpo"], "start_seconds": ["150", "50"], "properties": ["plastic, tap, speak", "water, bubbles, pouring"], "captions_pred_video": ["of the person preparing food in the kitchen", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "water is running from a faucet"], "question": "which entity is bubbling", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a man speaks as a motor runs in the background"], "sample_ids": ["wAAkbZToh8", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man burps and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["an infant crying frantically", "wind blows as people chatter quietly"], "sample_ids": ["zwOBqeFTgiU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], 
"properties": ["cry, infant, frantically", "wind, chatter, people"], "captions_pred_video": ["of the baby crying in the car seat", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a heavy rain falls endlessly", "winds blows roughly as a vehicle races past"], "sample_ids": ["wP8ZKrlx3oA", "xjvTpk2Zpr8"], "start_seconds": ["40", "70"], "properties": ["heavy, rain, fall", "wind, blows, vehicle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["people speak then an engine runs", "pigeons vocalize and birds chirp"], "sample_ids": ["uMTTDZ2mb4", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["engine, run, people", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yI-KvObbDoY", "w5W5Kqtc8E"], "start_seconds": ["260", "100"], "properties": ["sound, smack, wind", "wind, blow, vehicle"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", null], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, 
{"captions": ["male speech with light ticking", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["xO-Q2BlIIPU", "sapQIQUhFc"], "start_seconds": ["30", "280"], "properties": ["male, speech, ticking", "liquid, flow, distance"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xvDdE3zNf8Y", "tDVADusiIoc"], "start_seconds": ["120", "60"], "properties": ["A, crumple, paper", "water, radio, man"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "people applaud and hoot and chat quietly"], "sample_ids": ["tQWGZLItBXk", "wwyfGO2J4"], "start_seconds": ["170", "90"], "properties": ["voice, music, whoosh", "people, applaud, hoot"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man is filing a hard object", "a stream of water flows quickly"], "sample_ids": ["vveS8HT7Uog", "wbHTKEJZyhc"], "start_seconds": ["100", "20"], "properties": ["a man, hard, object", "stream, water, flow"], 
"captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a waterfall is flowing and people are speaking "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["yajyRTUQk3U", "tdWhHV3X25Q"], "start_seconds": ["400", "60"], "properties": ["noise, woman, speak", "applause, audience, yells"], "captions_pred_video": ["- a woman cooking in the kitchen", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is a 
performance", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vXlk0lIQBFo", "zl9Dqx-j7q4"], "start_seconds": ["470", "6"], "properties": ["wind, talk, vocalize", "engine, laugh, loud"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a man driving a car in the dark"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xyL9F5VrjkE", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["wind, motor, distance", "a woman, laughs, animal"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vbZ-0lGPneg", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["a woman, a television program, a bird", "wind, blow, vehicle"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vh30P49Po6s", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["loud, 
continuous, quacks", "sheep, baa, birds"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a duck is quacking loudly", "a goat bleats and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["a stream of water flows quickly", "wind blowing followed by a zoom"], "sample_ids": ["wbHTKEJZyhc", "vr8ZXjEBhMQ"], "start_seconds": ["20", "150"], "properties": ["stream, water, flow", "wind, blow, zoom"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is moving at a slower speed", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a 
slam and an electronic beep", "a stream of water runs briefly"], "sample_ids": ["wSVhSdj0F0", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["horn honks, keys jingle, slam", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a drill runs and two people laugh", "three men talk while wind blows and some liquid flows"], "sample_ids": ["tEE3MpBt1sg", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["two people, laugh, drill", "three men, wind, flow"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a drill running and two people laughing?", "label": 0}, {"captions": ["wind blows as people chatter quietly", "vehicles pass by on a roadway"], "sample_ids": ["xBxDz0CFVn0", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["wind, chatter, people", "pass, vehicle, roadway"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "an airplane engine runs"], "sample_ids": ["xjhAnI2q6hM", "yVPZ2MNWpms"], "start_seconds": ["6", "0"], "properties": ["engine revs, vehicle, people", "engine, airplane, runs"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of an airport with planes parked on the 
tarmac"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a car is driving by on the road "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a man speaks as a vehicle engine idles", "people speak in the background as a clock ticktocks"], "sample_ids": ["shmR4OZtzqA", "vZAw4apG0Es"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "background, clock, ticktocks"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a man speaks while a motor runs", "a clock is ticking and people are talking"], "question": "which entity has a clock ticking in the background?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tQWGZLItBXk", "xfaoyyzw2WU"], "start_seconds": ["170", "180"], "properties": ["voice, music, whoosh", "loud, jet engine, roar"], "captions_pred_video": ["worms revolution screenshots", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "an aircraft engine roars and a man speaks "], "question": 
"which entity is louder", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a woman speaks happily and an animal chirps"], "sample_ids": ["smDKStoHBJo", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["a, talk, baby, cry", "a woman, chirps, animal"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a dog is barking "], "question": "which entity has a baby?", "label": 0}, {"captions": ["a person is whistling", "water flows as men speak and yell"], "sample_ids": ["sIXTftIuUgw", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["person, whistling, person", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person whistling a song", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking and yelling?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vD6lYD1l0BY", "tDVADusiIoc"], "start_seconds": ["330", "60"], "properties": ["a, machine, run", "water, radio, man"], "captions_pred_video": ["game controller being held in the hands of the person", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a female speaks softly as paper crinkles"], "sample_ids": ["weDbePuc-Xc", "xvDdE3zNf8Y"], "start_seconds": ["40", "120"], "properties": ["cartoon character, music, vocalize", "a, female, speaks"], "captions_pred_video": ["a cartoon frog and a 
butterfly are sitting on the ground next to each other", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman speaks and crumples paper"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wTideSjRFS0", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["food, sizzle, woman", "music, gunfire, explosion"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "people applaud and hoot and chat quietly"], "sample_ids": ["spJCm8tD9Zo", "wwyfGO2J4"], "start_seconds": ["90", "90"], "properties": ["snores, wheezes, sleeps", "people, applaud, hoot"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp as a train approaches", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xM4joTqDVp4", "zj2R0XoFr5k"], "start_seconds": ["160", "50"], "properties": ["bird, chirp, train", "airplane, boy, fly"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are 
chirping and a train is moving ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "some tunes played by whistling"], "sample_ids": ["vf9xf3vMsGM", "u6BnG6YZqJ4"], "start_seconds": ["540", "0"], "properties": ["A man speaks while turning a water faucet on.", "tune, play, whistling"], "captions_pred_video": ["of the person washing their hands under the faucet", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a small engine idles continuously", "a train horn blows as it passes by"], "sample_ids": ["y5WII6cTH7k", "zVacuqSb4LI"], "start_seconds": ["40", "30"], "properties": ["engine, idle, continuously", "horn, blows, train"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train", "label": 1}, 
{"captions": ["a dog whimpers as someone inhales/exhales briefly", "someone snores nearby"], "sample_ids": ["vmrxwuAMb2I", "spJCm8tD9Zo"], "start_seconds": ["40", "90"], "properties": ["a dog, inhales, exhales", "someone snores, nearby, someone"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a dog barks and growls", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["water flows followed by women screaming", "race cars go around a track as a man commentates"], "sample_ids": ["w5W5Kqtc8E", "uZesmtKZGSw"], "start_seconds": ["100", "250"], "properties": ["water, flow, women", "car, track, man"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a video of a race?", "label": 1}, {"captions": ["a toilet flushes and water drains", "paper is crumpling consistently"], "sample_ids": ["sfAvvZwdLCY", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["water drains, flushes, water", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a toilet is 
flushed", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yeFvk9x0wWI", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["clack, bird, chirp", "engine, laugh, loud"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a child speaks in closed space"], "sample_ids": ["yZmhM1HcsyE", "yW6FWLSLkx4"], "start_seconds": ["4", "40"], "properties": ["engine, roar, water", "child, space, speak"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["some men converse over an engine running", "a clock ticktocks"], "sample_ids": ["sCiy7QS1U", "v-g-j2uTByM"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vcmWSmvti8", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], 
"properties": ["music, man, fire", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a video of a gun shooting?", "label": 1}, {"captions": ["someone snores nearby", "a duck quacks loudly and continuously"], "sample_ids": ["spJCm8tD9Zo", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "loud, continuous, quacks"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person is snoring loudly", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uWPRNLnpy7Y", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["accelerate, laugh, vehicle", "wind, blow, vehicle"], "captions_pred_video": ["is taken from a car driving down the street", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a motorboat is moving and people are shouting and cheering "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "speaking following by laughing and clapping"], "sample_ids": ["ukg5L09Wpvo", "u2f5NpsoHBg"], "start_seconds": ["150", "30"], "properties": ["clickety-clack, train, whistle", "person, laugh, clap"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman is speaking and a crowd is clapping"], 
"question": "which entity is clapping", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xBxDz0CFVn0", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["wind, chatter, people", "stream, water, flow"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be in a stream", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "paper folding and crinkling"], "sample_ids": ["yRx9txMcBl0", "zPpG3RD8lSs"], "start_seconds": ["40", "20"], "properties": ["accelerates, tires, squeals", "paper, fold, crinkle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a car is revving its engine and skidding ", "the wind blows and a mouse clicks "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a car accelerates and wind blows"], "sample_ids": ["vD6lYD1l0BY", "u0TrcHhkPQ"], "start_seconds": ["330", "20"], "properties": ["a, machine, run", "accelerates, wind, blows"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vbpKkWvfOu4", "tiDFTC-5vU"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "male, duck, laugh"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking and ducks are quacking"], "question": "which entity has a duck speaking?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "an infant crying as a woman laughs"], "sample_ids": ["xvDdE3zNf8Y", "xhmRY9yhC7c"], "start_seconds": ["120", "20"], "properties": ["a, female, speaks", "a, laugh, infant"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of a baby crying in a baby bouncer"], 
"captions_pred_audio": ["a woman speaks and crumples paper", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a infant makes noise and is excited", "a frog croaks as other frogs croak in the background"], "sample_ids": ["wIJK3-5y0kA", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["noise, excited, infant", "background, frog, croak"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a close up of a frog in the water"], "captions_pred_audio": ["a baby cries and a woman speaks", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a dog barks and whimpers", "a woman speaks and other women and a man talk with her"], "sample_ids": ["sShpyu2l4YQ", "vbpKkWvfOu4"], "start_seconds": ["0", "560"], "properties": ["barks, whimpers, dog", "a, woman, man"], "captions_pred_video": ["the puppies are playing with a toy", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a man is speaking"], "question": "which entity is more social", "label": 1}, {"captions": ["a toilet flushes and water drains", "a clock ticktocks briefly"], "sample_ids": ["sfAvvZwdLCY", "u7C-AEBQM"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a ticktock of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "small dogs yip and bark sharply"], "sample_ids": ["s4Uz1Ffgo04", "v-wcQf4BDY0"], "start_seconds": ["100", "120"], "properties": ["water, 
rushes, vehicle", "bark, yip, sharply"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["weDbePuc-Xc", "t97k0cejSQE"], "start_seconds": ["40", "250"], "properties": ["music, slaps, human", "sound, chirp, buzz"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a bee on a purple thistle flower"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a bee buzzes and a woman speaks"], "question": "which entity has a human sniveling?", "label": 0}, {"captions": ["material crumbles into a microphone", "a child speaks in closed space"], "sample_ids": ["vofpvUo6NAw", "yW6FWLSLkx4"], "start_seconds": ["220", "40"], "properties": ["material, crumbles, microphone", "child, space, speak"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a car speeding up in the distance", "pigeons vocalize and birds chirp"], "sample_ids": ["u0TrcHhkPQ", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["distance, car, speed", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a bee is buzzing"], 
"question": "which entity is not a car?", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["tPJvjq9QePY", "x9JovgqUcs"], "start_seconds": ["40", "500"], "properties": ["animal, bleat, moo", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tOj4tdLRaA", "wqZ135Ssz0"], "start_seconds": ["70", "60"], "properties": ["woman, laugh, baby", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "some men converse over an engine running"], "sample_ids": ["vKrYfzleLB8", "sCiy7QS1U"], "start_seconds": ["110", "300"], "properties": ["a, ring, gunshots", "men, converse, engine"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", null], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more calm", "label": 1}, {"captions": ["a baby laugh at a sputter", "some men converse over an engine running"], "sample_ids": ["sLUnaPT5gM8", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["laugh, sputter, baby", "men, converse, engine"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a 
man speaks while a motorcycle revs and accelerates "], "question": "which entity is a group of people", "label": 1}, {"captions": ["a person sniffles and sneezes", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uRlbY6aoBU", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["sneezes, sniffles, person", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xZepNM9qcRA", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["background, motor, run", "multiple, people, yell"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a clock ticktocks"], "sample_ids": ["tw76HGONaKg", "v-g-j2uTByM"], "start_seconds": ["570", "30"], "properties": ["music, click, man", "ticktocks, clock, ticktocks"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of 
zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "birds chirp and objects are moved around"], "sample_ids": ["rwtmaKiCcQU", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["nozzle, depressed, spray can", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["spraying and people speaking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["v5P-ThUCINM", "tDVADusiIoc"], "start_seconds": ["400", "60"], "properties": ["background, chirp, bird", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wnpJndXuxLc", "uZesmtKZGSw"], "start_seconds": ["50", "250"], "properties": ["blows, vehicle, train", "men, talk, cars"], "captions_pred_video": ["footage of the train coming down the 
tracks on a snowy day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a train?", "label": 0}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "food is frying while a woman speaks"], "sample_ids": ["vlS6YMeWAPo", "yhQ2Lg-7qDY"], "start_seconds": ["40", "130"], "properties": ["sheep, baa, birds", "food, woman, speak"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a goat bleats and birds chirp", "a faucet is running and a man is speaking"], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vD6lYD1l0BY", "vYkA3cfXp5Q"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "engine, accelerate, idle"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "waves crash 
against a shoreline and people speak"], "sample_ids": ["wTideSjRFS0", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["food, sizzle, woman", "wave, crash, shoreline"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a infant makes noise and is excited"], "sample_ids": ["wTjoRj1se3U", "wIJK3-5y0kA"], "start_seconds": ["390", "30"], "properties": ["engine, run, people", "noise, excited, infant"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a jet engine is running and people are talking", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "people speak as gunfire rings out"], "sample_ids": ["sLUnaPT5gM8", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["loud, laughter, intermittent", "gunfire, ring, speak"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a man speaks as a car is passing by"], "sample_ids": ["wIJK3-5y0kA", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, cry, baby", "a, car, pass"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 
6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a man speaking with light rustling"], "sample_ids": ["sapQIQUhFc", "zOZleIRqZm4"], "start_seconds": ["280", "80"], "properties": ["water, trickles, flow", "light, rustling, man"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking with crickets chirping in the background"], "question": "which entity is a man speaking while water trickles and flows?", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "dishes cling together then a man begins to speak"], "sample_ids": ["xKB8O8LTs6s", "sQGXqGcwOTc"], "start_seconds": ["70", "3"], "properties": ["music, gunshots, explosion", "cling, speak, dishes"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a person is whistling", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sIXTftIuUgw", "y8WEcpOlT3I"], "start_seconds": ["90", "40"], "properties": ["person, whistling, person", "harsh, wind, blows"], 
"captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "a infant makes noise and is excited"], "sample_ids": ["yRx9txMcBl0", "wIJK3-5y0kA"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "noise, excited, infant"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "continuous sizzling with a woman speaking towards the end"], "sample_ids": ["w-4gHptFNuU", "ukxt9I7eMMg"], "start_seconds": ["21", "30"], "properties": ["engine revs, accelerates, bump", "continuous, woman, speaking"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking while food is frying in the background "], "question": "which entity is a video of a motorcycle?", "label": 0}, {"captions": ["a woman speaks as frying food sizzles", "several insects fly 
while two men talk"], "sample_ids": ["wTideSjRFS0", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["food, sizzle, woman", "several, fly, men"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a car revs and accelerates loudly and men and women chatter among themselves"], "sample_ids": ["v0wPrLBI3hg", "y8dSeubCNI"], "start_seconds": ["30", "4"], "properties": ["vocalize, bird, speak", "men, women, car"], "captions_pred_video": ["footage of the pigeons feeding on the ground", null], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "an engine revving and people talking in the background"], "question": "which entity is more quiet", "label": 0}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["s4Uz1Ffgo04", "y8WEcpOlT3I"], "start_seconds": ["100", "40"], "properties": ["water, rushes, motorcycle", "harsh, wind, blows"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a motorcycle zooming by in the distance?", "label": 0}, {"captions": ["there are rhythmical snoring nearby", "speaking following by laughing and clapping"], "sample_ids": ["ujMt0-D-x2k", "u2f5NpsoHBg"], "start_seconds": ["0", "30"], "properties": ["snoring, rhythmical, nearby", "person, laugh, clap"], 
"captions_pred_video": ["of the dog playing with a toy on the floor", "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a crowd is clapping"], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a man speaks as a car is passing by"], "sample_ids": ["zY3icUyMdh8", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "a, car, pass"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vZAw4apG0Es", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["background, tick, repeat", "loud, multiple, distance"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["su6FAOcOA8c", "wDVMhEdTiVw"], 
"start_seconds": ["4", "30"], "properties": ["engine, run, woman", "gun, shoot, water"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vzxHnu-SFEw", "vb1fPSDI4c"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "multiple, people, yell"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "distant men speak as a spray can nozzle is 
depressed"], "sample_ids": ["tDVADusiIoc", "rwtmaKiCcQU"], "start_seconds": ["60", "30"], "properties": ["wind, radio, waves", "nozzle, depressed, spray can"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "spraying and people speaking"], "question": "which entity is a spray can?", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a man speaks while turning a water faucet on"], "sample_ids": ["w6RTHR6AeAg", "vf9xf3vMsGM"], "start_seconds": ["40", "540"], "properties": ["call, owl, screech", "A man speaks while turning a water faucet on."], "captions_pred_video": [null, "of the person washing their hands under the faucet"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a man is speaking while water is running in the background"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a duck quacks continuously"], "sample_ids": ["x6ijhqRY38s", "vh30P49Po6s"], "start_seconds": ["250", "30"], "properties": ["bowl, silverware, man", "quacks, continuously, duck"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "children speak and play together"], "sample_ids": ["vdoxuJn9lTc", "yVVP8XvWJTo"], "start_seconds": ["40", "260"], "properties": ["burp, loud, girl", "children, speak, play"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of a playground at a school or 
daycare center"], "captions_pred_audio": ["a child speaks followed by a burp", "children are speaking and breathing with background noise "], "question": "which entity is more social", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a woman speaks as she rubs two objects together"], "sample_ids": ["w0xsN8X18Y", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["rain, thunder, surface", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w2JXXIAdUdg", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "stream, water, flow"], "captions_pred_video": ["a close up shot of a person's mouth with a 
toothbrush in it", "footage is blurry and out of focus"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vh30P49Po6s", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["loud, continuous, quacks", "engine, revs, vehicle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a duck is quacking loudly", "a race car accelerates and revs its engine "], "question": "which entity is quieter", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["xjhAnI2q6hM", "uEU-Hg5MTN8"], "start_seconds": ["6", "27"], "properties": ["engine revs, vehicle, people", "animal, grunts, snorts"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "people cheer as a vehicle engine revs"], "sample_ids": ["spYNpeN7rPY", "xjhAnI2q6hM"], "start_seconds": ["1", "6"], "properties": ["a clock, ticktock, man", "engine revs, vehicle, people"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vD6lYD1l0BY", "wz7N8YRy74I"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "rooster, crow, background, men"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a woman speaks happily and an animal chirps"], "sample_ids": ["xERFUeZONz8", 
"uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["ring, approach, traffic", "a woman, chirps, animal"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", null], "captions_pred_audio": ["an emergency vehicle siren blares", "a woman is speaking and a dog is barking "], "question": "which entity is more calming", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["u7C-AEBQM", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["ticks, rhythmic, quiet", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a ticktock of a clock", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a person speaks briefly", "birds chirp and objects are moved around"], "sample_ids": ["zOZleIRqZm4", "yPUYU6t3rwo"], "start_seconds": ["80", "370"], "properties": ["person, talk, brief", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xjvTpk2Zpr8", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["engine, run, wind", "airplane, boy, fly"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman speaks while a helicopter flies overhead "], 
"question": "which entity is flying", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a propeller rotates loudly and intensely"], "sample_ids": ["sZPuqDgX2V0", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["commentator, race, track", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a child yells and another yells", "a man speaks followed by another man speaking outside"], "sample_ids": ["vMDHu7Lxcgw", "viuTg1M-dqg"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "two men, speak, follow"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two people speaking?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["x6ijhqRY38s", "sapQIQUhFc"], "start_seconds": ["250", "280"], "properties": ["bowl, silverware, man", "liquid, flow, distance"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and a stream is flowing in the background "], "question": "which entity shows a man speaking?", "label": 0}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "a car accelerates and wind blows"], "sample_ids": ["w-4gHptFNuU", "u0TrcHhkPQ"], "start_seconds": ["21", "20"], "properties": ["engine revs, accelerates, bump", "accelerates, 
wind, blows"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a clock ticktocks continuously", "a woman speaks with water running"], "sample_ids": ["vlJS7LN2XyM", "wTideSjRFS0"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "water, running, woman"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking while water is running in the background"], "question": "which entity is a clock", "label": 0}, {"captions": ["there are rhythmical snoring nearby", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["ujMt0-D-x2k", "wSVhSdj0F0"], "start_seconds": ["0", "10"], "properties": ["snoring, rhythmical, nearby", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["of the dog playing with a toy on the floor", null], "captions_pred_audio": ["a person is snoring loudly", "a car horn honks and keys jangle with background noise "], "question": "which entity is not rhythmical", "label": 1}, {"captions": ["a man talks as several small engines run", "winds blows roughly as a vehicle races past"], "sample_ids": ["u9A6VZQCZpU", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["a, man, talk", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["a 
horn honks twice and keys jingle, followed by a slam and an electronic beep", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wSVhSdj0F0", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["horn honks, keys jingle, slam", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaks while water drains", "someone snores nearby"], "sample_ids": ["vSeGhaZt-aI", "spJCm8tD9Zo"], "start_seconds": ["50", "90"], "properties": ["water, drain, man", "someone snores, nearby, someone"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["material crumbles into a microphone", "a clock ticktocks"], "sample_ids": ["vofpvUo6NAw", "v-g-j2uTByM"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, microphone", "ticktocks, clock, ticktocks"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vzxHnu-SFEw", "wqZ135Ssz0"], "start_seconds": ["80", "60"], "properties": ["two objects, woman, speak", "two men, woman, birds"], "captions_pred_video": ["how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a woman talks while a baby cries and a man whispers"], "sample_ids": ["yZrFNS7GFBQ", "smDKStoHBJo"], "start_seconds": ["30", "0"], "properties": ["pigeon, buzzes, insect", "a, talk, baby, cry"], "captions_pred_video": ["of the bird in the cage", "a man holding a crying baby in his arms"], "captions_pred_audio": ["an owl hoots in the background ", "a baby is crying and a woman is speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a propeller rotates loudly and intensely"], "sample_ids": ["un9VQlzgZM", "ugHJF0hfYkg"], "start_seconds": ["5", "10"], "properties": ["wind, speak, laugh", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and 
breathing in the background ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "pigeons vocalize and birds chirp"], "sample_ids": ["sHbXC6na9hg", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["a person, saw, wood", "vocalize, bird, chirp"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "of the pigeon in the cage"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a person is snoring while sleeping", "dishes cling together then a man begins to speak"], "sample_ids": ["vJrjSeP17yE", "sQGXqGcwOTc"], "start_seconds": ["40", "3"], "properties": ["a person is sleeping, snoring, person", "cling, speak, dishes"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person snoring loudly", "mechanisms are operating and water is splashing "], "question": "which entity is about a person speaking?", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "an infant crying as a woman laughs"], "sample_ids": ["uiItxDsDMFI", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["wood, piece, saw", "a, laugh, infant"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a saw is being used with background noise ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "dishes cling together then a man begins to speak"], "sample_ids": ["sofxkNWaP0s", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["wind, engine, louder", "cling, speak, dishes"], "captions_pred_video": ["of the 
airplane taking off from the runway with the speed limit sign in the foreground", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a stream of water runs briefly", "a stream of water runs briefly"], "sample_ids": ["x-PeY8Yb8M4", "x-PeY8Yb8M4"], "start_seconds": ["300", "300"], "properties": ["stream, water, run", "stream, water, run"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car is driving on a wet road ", "a car is driving on a wet road "], "question": "which stream of water runs briefly", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "small dogs yip and bark sharply"], "sample_ids": ["siJFXfGWgDk", "v-wcQf4BDY0"], "start_seconds": ["50", "120"], "properties": ["man, woman, vehicle", "bark, yip, sharply"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "an engine runs loudly"], "sample_ids": ["uZesmtKZGSw", "vqZuVbG6-HI"], "start_seconds": ["250", "130"], "properties": ["men, talk, cars", "loud, engine, run"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a infant makes noise and is excited"], "sample_ids": ["uYT5gxnyMWM", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "noise, excited, infant"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "an infant crying frantically"], "sample_ids": ["tDVADusiIoc", "zwOBqeFTgiU"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "cry, infant, frantically"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tDVADusiIoc", "wqZ135Ssz0"], "start_seconds": ["60", "60"], "properties": ["wind, radio, waves", "two men, woman, birds"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man 
is speaking while the wind is blowing and water is splashing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "people speak as gunfire rings out"], "sample_ids": ["vSeGhaZt-aI", "wqTCwqVRDlk"], "start_seconds": ["50", "80"], "properties": ["water, bubbles, run", "gunfire, ring, speak"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a jet engine spools up and takes off", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vBslzh7saPw", "wz7N8YRy74I"], "start_seconds": ["90", "30"], "properties": ["engine, spools, takes", "rooster, crow, background, men"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "wind blows as people chatter quietly"], "sample_ids": ["sofxkNWaP0s", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "wind, chatter, people"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 
1}, {"captions": ["a woman speaks as she rubs two objects together", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vzxHnu-SFEw", "vYkA3cfXp5Q"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "engine, accelerate, idle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "an engine is idling"], "question": "which is a vehicle", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["vlS6YMeWAPo", "vddP56-ogds"], "start_seconds": ["40", "30"], "properties": ["noise, bleat, call", "liquid, laughs, man"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "water is running and gurgling and a man is speaking"], "question": "which entity has a man talking?", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds 
chirp in the background", "water flows as men speak and yell"], "sample_ids": ["ziUT9IFTkjg", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["background, birds, rustling", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["material crumbles into a microphone", "an engine runs loudly"], "sample_ids": ["vofpvUo6NAw", "vqZuVbG6-HI"], "start_seconds": ["220", "130"], "properties": ["material, crumbles, microphone", "loud, engine, run"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage is blurry because it's raining outside"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a propeller rotates loudly and intensely"], "sample_ids": ["sQGXqGcwOTc", "ugHJF0hfYkg"], "start_seconds": ["3", "10"], "properties": ["audio, kid, giggles", "loud, intense, propeller"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a baby cries and a woman speaks", "an insect buzzes around continuously"], "sample_ids": ["tMbMDvT50j8", "v25l1jef3JY"], "start_seconds": ["12", "0"], "properties": ["a, cry, woman", "buzzes, continuously, insect"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a black background with a cartoon character in the foreground"], 
"captions_pred_audio": ["a baby cries and a woman speaks", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["a dog barks and whimpers", "a propeller rotates loudly and intensely"], "sample_ids": ["sShpyu2l4YQ", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["barks, whimpers, dog", "loud, intense, propeller"], "captions_pred_video": ["the puppies are playing with a toy", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a dog is barking and growling", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["bees buzz as wind blows", "waves crash against a shoreline and people speak"], "sample_ids": ["tMJne1a4AFI", "yFB25fqfU8I"], "start_seconds": ["0", "300"], "properties": ["bees, buzz, wind", "wave, crash, shoreline"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "water running down a sink while a man is talking"], "sample_ids": ["tDlysoZiA1I", "vSeGhaZt-aI"], "start_seconds": ["0", "50"], "properties": ["animal, grunt, multiple", "water, sink, talk"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of a man talking?", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wDVMhEdTiVw", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["gun, shoot, 
water", "a woman, something, fried"], "captions_pred_video": ["a blurry image of trees and water in the forest", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "race cars go around a track as a man commentates"], "sample_ids": ["zofjfKhqLk8", "uZesmtKZGSw"], "start_seconds": ["10", "250"], "properties": ["background, metal, clank", "car, track, man"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a car is revving with laughter in the background "], "question": "which is a video of a man commentating", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "water pouring and bubbling"], "sample_ids": ["y8dSeubCNI", "uyRfq-jKPpo"], "start_seconds": ["4", "50"], "properties": ["men, women, car", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an 
afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["an engine revving and people talking in the background", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a man speaks as a car is passing by"], "sample_ids": ["slZLHwNbbt4", "sK4u5T8hW78"], "start_seconds": ["300", "30"], "properties": ["a, horn, run", "a, car, pass"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["tDVADusiIoc", "wyllXV6PjKo"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "a baby, a woman, a man"], "captions_pred_video": ["a person riding on the back of a 
sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman speaks and a baby cries"], "question": "which entity is a group of people", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["ugHJF0hfYkg", "su6FAOcOA8c"], "start_seconds": ["10", "4"], "properties": ["engine, running, continuously", "engine, idle, woman"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a subway train is moving "], "question": "which entity has an engine that is running continuously?", "label": 0}, {"captions": ["a stream of water flows quickly", "a helicopter engine idles continuously"], "sample_ids": ["wbHTKEJZyhc", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["stream, water, flow", "engine, idle, continuously"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the 
background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a helicopter is flying overhead "], "question": "which entity is not moving", "label": 1}, {"captions": ["a clock ticktocks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["v-g-j2uTByM", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["ticktocks, clock, ticktocks", "water, radio, man"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a clock?", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a woman speaks over sizzling noise"], "sample_ids": ["vdoxuJn9lTc", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["person, burp, girl", "noise, woman, speak"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a child speaks followed by a burp", "a woman is speaking while food is frying in the background"], "question": "which entity is speaking over noise", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["xSKJGCItUWE", "wz7N8YRy74I"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", "rooster, crow, background, men"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": 
["a high pitched engine is running and a child speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster crow?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sQGXqGcwOTc", "uZesmtKZGSw"], "start_seconds": ["3", "250"], "properties": ["cling, speak, dishes", "men, talk, cars"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["popping and crackling repeats as men yell and laugh", "a female speaks softly as paper crinkles"], "sample_ids": ["rqu8iB22IY", "xvDdE3zNf8Y"], "start_seconds": ["5", "120"], "properties": ["sound, repeats, laugh", "a, female, speaks"], "captions_pred_video": [null, "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a woman speaks and crumples paper"], "question": "which entity has a female speaking softly?", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vuUVPzd2FXw", 
"w34HjHr6gAY"], "start_seconds": ["160", "30"], "properties": ["a, steam, release", "beeps, hit, woman"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zofjfKhqLk8", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "stream, water, flow"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage is blurry and out of focus"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak then an engine runs", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["uMTTDZ2mb4", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["engine, run, people", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine revs 
as the vehicle passes", "roadway noise occurs and a truck accelerates"], "sample_ids": ["yDoT73BWsdA", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["engine, revs, vehicle", "noise, truck, accelerate"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sG7TyPnFDR0", "xKB8O8LTs6s"], "start_seconds": ["180", "70"], "properties": ["beeps, machine, smoke alarm", "music, gunfire, explosion"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["siJFXfGWgDk", "zFjIWfSD-4"], "start_seconds": ["50", "410"], "properties": ["man, woman, vehicle", "People, motor, brakes"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more people", "label": 1}, {"captions": ["people clap and speak in the distance", "people cheer as a vehicle engine revs"], "sample_ids": ["wwyfGO2J4", "xjhAnI2q6hM"], "start_seconds": ["90", "6"], "properties": ["clap, distance, speak", "engine revs, vehicle, people"], 
"captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["someone snores nearby", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["spJCm8tD9Zo", "yajyRTUQk3U"], "start_seconds": ["90", "400"], "properties": ["someone snores, nearby, someone", "a woman, something, fried"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a train horn blows as it passes by"], "sample_ids": ["siJFXfGWgDk", "zVacuqSb4LI"], "start_seconds": ["50", "30"], "properties": ["man, woman, vehicle", "horn, blows, train"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking and birds are 
chirping in the background ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaking with light rustling", "a car accelerates and wind blows"], "sample_ids": ["zOZleIRqZm4", "u0TrcHhkPQ"], "start_seconds": ["80", "20"], "properties": ["light, rustling, man", "accelerates, wind, blows"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks as several small engines run", "three men talk while wind blows and some liquid flows"], "sample_ids": ["u9A6VZQCZpU", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, man, talk", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["continuous sneezing together with speech", "three men talk while wind blows and some liquid flows"], "sample_ids": ["x4dZyf9Gbj0", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["continuous, sneeze, speech", "three men, wind, flow"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be a video of a person sneezing?", "label": 0}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vfYTJq7nU", "uEU-Hg5MTN8"], 
"start_seconds": ["130", "27"], "properties": ["ducks, quack, man", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "continuous snoring"], "sample_ids": ["tw76HGONaKg", "sLkeqCDJIyw"], "start_seconds": ["570", "120"], "properties": ["A, game, keyboard", "loud, snoring, noise"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", ", what is the man doing on the couch? 
sleeping"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a train engine runs and a horn blows", "wind blowing followed by a zoom"], "sample_ids": ["zPX9o1uDiI", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["engine, horn, run", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "water flows as men speak and yell"], "sample_ids": ["sjlVMgdGSK0", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["accelerates, vehicle, race car", "water, flow, men"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a moving object", "label": 0}, {"captions": ["food fries in a pan as someone talks and cooks", "a duck quacks continuously"], "sample_ids": ["ukxt9I7eMMg", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "quacks, continuously, duck"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "water splashes as an animal walks through"], "sample_ids": ["vddP56-ogds", "w1ir-sZ3Im8"], 
"start_seconds": ["30", "90"], "properties": ["liquid, laughs, man", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vdoxuJn9lTc", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["burp, loud, girl", "men, talk, cars"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds chirp as a bell rings", "waves crash against a shoreline and people speak"], "sample_ids": ["ziUT9IFTkjg", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["chirp, bell, ring", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 
1}, {"captions": ["tapping occurs then a baby cries", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wIJK3-5y0kA", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["a, cry, baby", "music, gunfire, explosion"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby cries and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a person snoring several times", "a frog croaks as other frogs croak in the background"], "sample_ids": ["spJCm8tD9Zo", "yswmmRZFItk"], "start_seconds": ["90", "0"], "properties": ["snore, person, several", "background, frog, croak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a close up of a frog in the water"], "captions_pred_audio": ["a person is snoring loudly", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["food is frying and sizzles", "someone is typing on a computer keyboard"], "sample_ids": ["zNRChLjqcU", "v0x1odnXtP0"], "start_seconds": ["220", "210"], "properties": ["food is frying, sizzles, food", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["water is running from a faucet into a sink", "a person is typing on a keyboard"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a beep occurs briefly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xtWeJ56-U-g", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["beep, occur, briefly", "airplane, boy, fly"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a train horn sounds and railroad crossing ring", "people speak in a closed space"], "sample_ids": ["s7knHCFW82w", "sTpirNYo8vQ"], "start_seconds": ["30", "30"], "properties": ["horn, sound, train", "people, space, speak"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking while a car is revving and accelerating "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a man speaks as a motor runs in the background"], "sample_ids": ["sfAvvZwdLCY", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "background, motor, run"], "captions_pred_video": ["footage of the toilet in the bathroom", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a toilet is flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a stream of water runs briefly", "a car speeding up in the distance"], "sample_ids": ["x-PeY8Yb8M4", "u0TrcHhkPQ"], "start_seconds": ["300", "20"], "properties": ["stream, water, run", "distance, car, speed"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a race car accelerates 
and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w6RTHR6AeAg", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["call, owl, screech", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w0xsN8X18Y", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["rain, thunder, surface", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["w2JXXIAdUdg", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["emits, sleeping, person", "water, radio, man"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["children cry and people talk", "a vehicle engine runs and wind blows before women yell"], "sample_ids": 
["xLwHe825Zs", "w5W5Kqtc8E"], "start_seconds": ["18", "100"], "properties": ["people talk, children cry, people talk", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby cries and a woman speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yaln9y8I7ms", "xBxDz0CFVn0"], "start_seconds": ["230", "30"], "properties": ["female, flushes, toilet", "stream, water, flow"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a man speaks as a car is passing by"], "sample_ids": ["uZesmtKZGSw", "sK4u5T8hW78"], "start_seconds": ["250", "30"], "properties": ["car, track, man", "a, car, pass"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking to a car passing by?", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "water flows and trickles"], "sample_ids": ["tgbONvsP47Y", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["noise, truck, accelerate", "water, flow, trickle"], "captions_pred_video": ["footage of a fire truck entering a garage", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car is driving on the road ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a car accelerates and wind blows"], "sample_ids": ["vYkA3cfXp5Q", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["engine, accelerate, idle", "accelerates, wind, blows"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", null], "captions_pred_audio": ["an engine is idling", "a race car accelerates and revs its engine "], "question": "which entity is a car?", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "an airplane engine spools and people speak"], "sample_ids": ["smGI3C1NZc", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["water, drain, toilet", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a toilet is flushed", "a jet engine is running and people are talking"], "question": "which entity is a machine", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a vehicle engine runs 
and wind blows before women yell"], "sample_ids": ["zPX9o1uDiI", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["engine, horn, run", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["wz7N8YRy74I", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["rooster, crow, background, men", "two men, woman, birds"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["speaking following by laughing and clapping", "vehicles pass by on a roadway"], "sample_ids": ["u2f5NpsoHBg", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["person, laugh, clap", "pass, vehicle, roadway"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "small dogs yip and bark sharply"], "sample_ids": ["zCrAfDfv6-A", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["person, mouse, click", "bark, yip, sharply"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a 
person whistles a song", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["material crumbles into a microphone", "vehicles pass by on a roadway"], "sample_ids": ["vofpvUo6NAw", "tgbONvsP47Y"], "start_seconds": ["220", "0"], "properties": ["material, crumbles, microphone", "pass, vehicle, roadway"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of a fire truck entering a garage"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zgUgkpk78xU", "zFjIWfSD-4"], "start_seconds": ["70", "410"], "properties": ["clinking, humming, horn", "People, motor, brakes"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a train?", "label": 0}, {"captions": ["white noise and birds chirping", "several insects fly while two men talk"], "sample_ids": ["wRBHTgrbiwg", "s-T9OVOiMLo"], "start_seconds": ["50", "330"], "properties": ["noise, white, chirping", "several, fly, men"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking while insects are buzzing in the background "], "question": 
"which entity is more active", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "paper is crumpling consistently"], "sample_ids": ["xzKKf9bKNUo", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["background, noise, snoring", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person snoring loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a man speaks as a motor runs in the background"], "sample_ids": ["xyL9F5VrjkE", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["wind, motor, distance", "background, motor, run"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["u5RmF3c3Aw", "tDVADusiIoc"], "start_seconds": ["60", "60"], "properties": ["engine, car, zoom", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a man speaking with light rustling", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": 
["zOZleIRqZm4", "uYT5gxnyMWM"], "start_seconds": ["80", "50"], "properties": ["light, rustling, man", "female, spraying, scream"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity is a woman?", "label": 1}, {"captions": ["bees buzz and wind blows", "water pouring and bubbling"], "sample_ids": ["tMJne1a4AFI", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["bees buzz, wind blows, bees", "water, bubbles, pouring"], "captions_pred_video": ["a swarm of bees on the ground", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a swarm of bees buzzing around", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["w5W5Kqtc8E", "vfYTJq7nU"], "start_seconds": ["100", "130"], "properties": ["wind, engine, scream", "rustling, ducks, quack"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an audience gives applause", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["x6iCUDmRpKQ", "yajyRTUQk3U"], "start_seconds": ["38", "400"], "properties": ["applause, audience, give", "a woman, something, fried"], "captions_pred_video": ["a black background with the moon and stars in the sky", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a group of people are clapping and cheering", "a woman is speaking while food is frying in the background"], "question": "which entity is a demonstration", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a woman talking as an infant is crying"], "sample_ids": ["vfYTJq7nU", "tMbMDvT50j8"], "start_seconds": ["130", "12"], "properties": ["rustling, ducks, quack", "a, talk, infant"], "captions_pred_video": [null, "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is a person talking to an infant?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a man speaks followed by another man speaking outside"], "sample_ids": ["siJFXfGWgDk", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["a, bird, vehicle", "two men, speak, follow"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": 
["a man speaks and a rooster crows while men talk in the background", "a clock ticktocks"], "sample_ids": ["wz7N8YRy74I", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a car speeding up in the distance"], "sample_ids": ["y8WEcpOlT3I", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["wind, speak, buffeting", "distance, car, speed"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a airplane flies overhead as a woman speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zj2R0XoFr5k", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["airplane, fly, woman", "female, spraying, scream"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman speaking?", "label": 0}, {"captions": ["a saw finishes running as metal clings in the background", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zofjfKhqLk8", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "multiple, people, yell"], 
"captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vbpKkWvfOu4", "vJ7JPEFhyLA"], "start_seconds": ["560", "16"], "properties": ["a, man, speaks", "three men, wind, flow"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "paper is crumpling consistently"], "sample_ids": ["yNtRmrn0io8", "v5cSxLaHADY"], "start_seconds": ["210", "0"], "properties": ["storm, distance, strike", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a house in the middle of the night", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["rain falls and thunder roars", "paper is crumpled and crinkled"], "question": "which entity is not a storm?", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "paper folding and crinkling"], "sample_ids": ["shmR4OZtzqA", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["man, engine, idle", "paper, fold, crinkle"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 
2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man speaks while a motor runs", "the wind blows and a mouse clicks "], "question": "which is not a vehicle", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w0xsN8X18Y", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["music, surface, rain", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a motorboat is moving 
in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more liquid flowing", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "a frog croaks as other frogs croak in the background"], "sample_ids": ["s4tUs779vBA", "yswmmRZFItk"], "start_seconds": ["160", "0"], "properties": ["a, sound, stop", "background, frog, croak"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "a close up of a frog in the water"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a frog is croaking"], "question": "which entity has more frogs croaking in the background", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "someone is typing on a computer keyboard"], "sample_ids": ["sd7xVssqlw", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["accelerates, tires, squealing", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which entity is stationary", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "a woman speaks as she rubs two objects together"], "sample_ids": ["sTpirNYo8vQ", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["a, tone, fast", "two objects, woman, speak"], "captions_pred_video": ["of a man taking a selfie on a bus", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman speaks as she rubs two objects together?", "label": 1}, {"captions": ["a infant makes noise and is excited", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wIJK3-5y0kA", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["noise, excited, infant", "loud, laughter, intermittent"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaking with light rustling", "a person snores loudly multiple times at a close distance"], "sample_ids": ["zOZleIRqZm4", "sSMl2vc3ek"], "start_seconds": ["80", "20"], "properties": ["light, rustling, man", "loud, multiple, distance"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xfudFO976zE", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["animal, bleats, cry", "female, spraying, scream"], 
"captions_pred_video": ["footage is blurry and shaky", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["u6jIvCtKarQ", "sSMl2vc3ek"], "start_seconds": ["70", "20"], "properties": ["a, man, speaks", "loud, multiple, distance"], "captions_pred_video": ["footage of a person using a blender on a stove top", null], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "paper is crumpling consistently"], "sample_ids": ["uiItxDsDMFI", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["wood, piece, saw", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a saw is being used with background noise ", "paper is crumpled and crinkled"], "question": "which object is being crumpled", "label": 0}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a man speaks over intermittent keyboard taps"], "sample_ids": ["yLy-WycbVVE", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["background, people, talk", "audio, man, keyboard"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the 
legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a man speaks and types on a computer keyboard "], "question": "which entity has a more calming background", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "wind blows as people chatter quietly"], "sample_ids": ["zY3icUyMdh8", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "wind, chatter, people"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a propeller rotates loudly and intensely"], "sample_ids": ["wsHBIgzs9Fs", "ugHJF0hfYkg"], "start_seconds": ["50", "10"], "properties": ["horn, continuous, buzzing", "loud, intense, propeller"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, 
{"captions": ["a loud engine muffles a man as he speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xyx6eNVEYRY", "tDVADusiIoc"], "start_seconds": ["380", "60"], "properties": ["loud, engine, muffles", "water, radio, man"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vD6lYD1l0BY", "wDVMhEdTiVw"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "gun, shoot, water"], "captions_pred_video": ["game controller being held in the hands of the person", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["an insect buzzes around continuously", "someone snores nearby"], "sample_ids": ["v25l1jef3JY", "spJCm8tD9Zo"], "start_seconds": ["0", "90"], "properties": ["buzzes, continuously, insect", "someone snores, nearby, someone"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a horn rings out as a machine runs by"], "sample_ids": ["vW4x7S1VfQc", "slZLHwNbbt4"], "start_seconds": ["150", "300"], "properties": ["clacking, oil, woman", "a, horn, run"], "captions_pred_video": 
["footage of a person cooking fish in a frying pan on a stove top", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["food sizzles in a frying pan", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a man speaks as a machine runs", "people applaud and hoot and chat quietly"], "sample_ids": ["vD6lYD1l0BY", "wwyfGO2J4"], "start_seconds": ["330", "90"], "properties": ["a, machine, run", "people, applaud, hoot"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a person sniffles and sneezes", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["uRlbY6aoBU", "y8WEcpOlT3I"], "start_seconds": ["0", "40"], "properties": ["sneezes, sniffles, person", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be a person", "label": 0}, {"captions": ["a bird is chirping and tweeting a bird song", "a toilet flushes and a female speaks"], "sample_ids": ["wPz6QRAkEb4", "yaln9y8I7ms"], "start_seconds": ["60", "230"], "properties": ["chirps, tweets, song", "female, flushes, toilet"], "captions_pred_video": ["a bird in a cage on top of a pole", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping in the background ", "a toilet flushes and a man speaks"], "question": "which entity is not a bird?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a woman speaks and other women and a man talk with her"], "sample_ids": ["w2M4i1mklOA", "vbpKkWvfOu4"], 
"start_seconds": ["30", "560"], "properties": ["alarm, gears, turn", "a, woman, man"], "captions_pred_video": ["footage of an antique clock", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["smDKStoHBJo", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["a, talk, baby, cry", "People, motor, brakes"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a man speaks as a car is passing by"], "sample_ids": ["sQwlkXjQabo", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "a, car, pass"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with background noise and breathing sounds "], "question": "which entity is 
moving", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a heavy rain falls endlessly"], "sample_ids": ["zALy31PjDl0", "wP8ZKrlx3oA"], "start_seconds": ["21", "40"], "properties": ["a man, a vehicle, a horn", "heavy, rain, fall"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a heavy rain is falling on a surface"], "question": "which entity is a weather event", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wqUmIEzuNz4", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["frog, bird, vocalize", "men, talk, cars"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a cat meows and rustles", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle engine revs as the vehicle passes", "a woman speaks as she rubs two objects together"], "sample_ids": ["yDoT73BWsdA", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["engine, revs, vehicle", "two objects, woman, speak"], "captions_pred_video": ["a man driving a race car with a helmet on the 
steering wheel", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "three men talk while wind blows and some liquid flows"], "sample_ids": ["slZLHwNbbt4", "vJ7JPEFhyLA"], "start_seconds": ["300", "16"], "properties": ["train, horn, sound", "three men, wind, flow"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a train?", "label": 1}, {"captions": ["electronic beeps occur in a short series", "paper folding and crinkling"], "sample_ids": ["y682ml90jGw", "zPpG3RD8lSs"], "start_seconds": ["11", "20"], "properties": ["beeps, series, electronic", "paper, fold, crinkle"], "captions_pred_video": [null, 
"how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a beeping sound is being made ", "the wind blows and a mouse clicks "], "question": "which entity is not a series of beeps", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yYEVLuqEytU", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["grunt, slurp, background", "a, scream, girl"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a male speaks and another male speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["viuTg1M-dqg", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "rooster, crow, background, men"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of the sun shining through the clouds on a cloudy 
day"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "birds chirp and objects are moved around"], "sample_ids": ["sG7TyPnFDR0", "yPUYU6t3rwo"], "start_seconds": ["180", "370"], "properties": ["beeps, machine, smoke alarm", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "people speak as gunfire rings out"], "sample_ids": ["wjsXBsc7M40", "wqTCwqVRDlk"], "start_seconds": ["10", "80"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "gunfire, ring, speak"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a woman speaks with water running", "some men converse over an engine running"], "sample_ids": ["wTideSjRFS0", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["water, running, woman", "men, converse, engine"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman speaking with water running?", "label": 0}, {"captions": ["water splashing 
and wind blowing as a powerful engine roars", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yZmhM1HcsyE", "uZesmtKZGSw"], "start_seconds": ["4", "250"], "properties": ["engine, roar, water", "men, talk, cars"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more likely to be in a race", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a man speaks with another voice speaking in the background"], "sample_ids": ["vZAw4apG0Es", "u21-Z5gJCB8"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "background, voice, man"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking in the background?", "label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "people applaud and hoot and chat quietly"], "sample_ids": ["y2ZBGpgbhHM", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["dog, chirp, breathe", 
"people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a car is passing by", "white noise and snoring with some rustling in the background"], "sample_ids": ["sK4u5T8hW78", "xzKKf9bKNUo"], "start_seconds": ["30", "10"], "properties": ["a, car, pass", "background, noise, snoring"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "shows a woman laying on a bed with her eyes closed and her mouth open"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person snoring loudly"], "question": "which entity has a car passing by", "label": 0}, {"captions": ["a propeller rotates loudly and intensely", "people cheer as a vehicle engine revs"], "sample_ids": ["ugHJF0hfYkg", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["loud, intense, propeller", "engine revs, vehicle, people"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a helicopter is flying overhead ", "a truck is revving its engine and a man is speaking "], "question": "which is louder", "label": 0}, {"captions": ["ticking continues without interruption", "some tunes played by whistling"], "sample_ids": ["v-g-j2uTByM", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["ticking, continuous, clock", "tune, play, whistling"], 
"captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a clock is ticking loudly", "a person whistling a song"], "question": "which entity is not continuous", "label": 1}, {"captions": ["a helicopter engine runs", "an insect buzzes around continuously"], "sample_ids": ["t5ZbXbniOWk", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["engine, helicopter, run", "buzzes, continuously, insect"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a helicopter is flying overhead ", "a fly is buzzing around a microphone "], "question": "which entity is not a helicopter?", "label": 1}, {"captions": ["a woman and man are speaking", "someone whistles a tune"], "sample_ids": ["vbpKkWvfOu4", "sIXTftIuUgw"], "start_seconds": ["560", "90"], "properties": ["two people, speaking, woman, man", "someone, tune, whistle"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a person whistling a song"], "question": "which entity is a single person", "label": 1}, {"captions": ["goats bleat and metal clings", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tH17JPjDPnc", "sSMl2vc3ek"], "start_seconds": ["260", "20"], "properties": ["bleat, metal, clings", "loud, multiple, distance"], "captions_pred_video": ["feed of the goats eating hay in the barn", null], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a person snoring loudly"], "question": "which entity is 
louder", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a woman speaks as she rubs two objects together"], "sample_ids": ["siJFXfGWgDk", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], "properties": ["a, bird, vehicle", "two objects, woman, speak"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["an adult woman and an adult man speak", "wind blows as people chatter quietly"], "sample_ids": ["zTLVJCo4WEE", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["two people, adult, speak", "wind, chatter, people"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", 
"label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "birds tweet and squawk"], "sample_ids": ["yI-KvObbDoY", "w1mlz3Pe4fU"], "start_seconds": ["260", "300"], "properties": ["sound, smack, wind", "squawk, tweet, scream"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "of a bird in a cage"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "birds are chirping and singing"], "question": "which entity is a bird", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a car accelerates and wind blows"], "sample_ids": ["vzceMbklWc", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["water, faucet, sink", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "wind blowing followed by a zoom"], "sample_ids": ["y8dSeubCNI", "vr8ZXjEBhMQ"], "start_seconds": ["4", "150"], "properties": ["engine revving, people speaking, motorcycle", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["an engine revving and people talking in the background", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a woman and man are speaking", "a car accelerates and wind blows"], "sample_ids": ["vbpKkWvfOu4", "u0TrcHhkPQ"], "start_seconds": ["560", "20"], "properties": ["two people, speaking, woman, man", "accelerates, wind, blows"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a race car accelerates and revs its engine "], "question": "which is not a person", "label": 1}, {"captions": ["a cat meows and children speak", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": ["x5cuQjOdM3E", "s4Uz1Ffgo04"], "start_seconds": ["30", "100"], "properties": ["cat, speak, children", "roars, background, people speaking"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["paper is crumpling consistently", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["v5cSxLaHADY", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "men, talk, cars"], "captions_pred_video": ["footage of the person holding a pair of scissors", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["water quietly rushes by while 
birds chirp in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sYITalLZjj4", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["water, rushes, background, birds", "airplane, boy, fly"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["wind blows and birds chirp", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["sNB8zxXneIM", "yeFvk9x0wWI"], "start_seconds": ["20", "30"], "properties": ["several, quack, cocks", "clack, bird, chirp"], "captions_pred_video": ["a group of geese in a cage", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "birds chirp in the background as a car drives by "], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person whistles a meandering tune", "a airplane flies overhead as a woman speaks"], "sample_ids": ["uFoga8sHpiw", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["person, tune, whistle", "airplane, fly, woman"], "captions_pred_video": ["footage of a bird in a cage", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person whistles a song", "a woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["vqZuVbG6-HI", "xjvTpk2Zpr8"], "start_seconds": ["130", "70"], "properties": ["background, male, female", "wind, blows, vehicle"], 
"captions_pred_video": ["footage is blurry because it's raining outside", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["an airplane engine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yVPZ2MNWpms", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["engine, airplane, runs", "female, spraying, scream"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car is driving by on the road ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a male speaks and another male speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["viuTg1M-dqg", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "two men, speak, follow"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 0}, {"captions": ["waves crash against a shoreline and wind blows", "a car speeding up in the distance"], "sample_ids": ["zdYdyF9-m8U", "u0TrcHhkPQ"], "start_seconds": ["7", "20"], "properties": ["wind, crash, shoreline", "distance, car, speed"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", null], "captions_pred_audio": ["waves crash and wind blows ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["multiple ducks quack 
continuously", "water flows and trickles"], "sample_ids": ["wfHeoPDLMaM", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "water, flow, trickle"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["ducks are quacking", "water is splashing and gurgling"], "question": "which entity is a source of water", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "some tunes played by whistling"], "sample_ids": ["w9lpbUn0hPc", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["male, wind, rustling", "tune, play, whistling"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a motorcycle engine revs then 
accelerates before hitting a bump", "a propeller rotates loudly and intensely"], "sample_ids": ["w-4gHptFNuU", "ugHJF0hfYkg"], "start_seconds": ["21", "10"], "properties": ["engine revs, accelerates, bump", "loud, intense, propeller"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a horn rings out as a machine runs by"], "sample_ids": ["vh30P49Po6s", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["loud, continuous, quacks", "a, horn, run"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a duck is quacking loudly", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is not continuous", "label": 1}, {"captions": ["a person snoring", "a frog croaks as other frogs croak in the background"], "sample_ids": ["t8tv5YRMJUg", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["a person, snore, loud", "background, frog, croak"], "captions_pred_video": ["of a man getting his face licked by another man", "a close up of a frog in the water"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a frog is croaking"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "a toilet flushes and a female speaks"], "sample_ids": ["tqR406bGiE", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": ["flush, water, gurgle", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet is flushed", "a toilet flushes and a man speaks"], 
"question": "which entity has a female speaking?", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zkKdxzNC97Y", "xfaoyyzw2WU"], "start_seconds": ["27", "180"], "properties": ["hard, surface, door", "loud, jet engine, roar"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a door is opened and closed", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "vehicles pass by on a roadway"], "sample_ids": ["vms5XGTDVQc", "tgbONvsP47Y"], "start_seconds": ["220", "0"], "properties": ["paper, crumpled, crinkled", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "footage of a fire truck entering a garage"], "captions_pred_audio": ["paper is crumpled and crinkled", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a drill runs and two people laugh", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tEE3MpBt1sg", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["two people, laugh, drill", "loud, multiple, distance"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman talking as an infant is crying", "a stream runs then someone speaks"], "sample_ids": ["tMbMDvT50j8", "wbHTKEJZyhc"], "start_seconds": ["12", "20"], "properties": ["a, talk, infant", "stream, run, someone"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a river 
in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a baby cries and a woman speaks", "a waterfall is flowing and people are speaking "], "question": "which entity is a stream?", "label": 1}, {"captions": ["water flows followed by women screaming", "a toilet flushes and a female speaks"], "sample_ids": ["w5W5Kqtc8E", "yaln9y8I7ms"], "start_seconds": ["100", "230"], "properties": ["water, flow, women", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a man speaks as a vehicle engine idles"], "sample_ids": ["vb1fPSDI4c", "shmR4OZtzqA"], "start_seconds": ["30", "30"], "properties": 
["multiple, people, yell", "man, engine, idle"], "captions_pred_video": [null, "shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a man speaks while a motor runs"], "question": "which entity has a vehicle engine idle?", "label": 1}, {"captions": ["white noise and birds chirping", "water splashes as an animal walks through"], "sample_ids": ["wRBHTgrbiwg", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["noise, white, chirping", "animal, water, splashes"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a clock ticktocks"], "sample_ids": ["ugHJF0hfYkg", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["engine, idle, continuously", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a helicopter is flying overhead ", "a clock is ticking loudly"], "question": "which entity is ticking 
continuously", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sShpyu2l4YQ", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["growl, bark, yip", "background, frog, croak"], "captions_pred_video": ["the puppies are playing with a toy", "a close up of a frog in the water"], "captions_pred_audio": ["a dog is barking and growling", "a frog is croaking"], "question": "which entity is a solitary animal", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "wind blowing followed by a zoom"], "sample_ids": ["zsLxS-uLJTw", "vr8ZXjEBhMQ"], "start_seconds": ["20", "150"], "properties": ["horn, blast, train", "wind, blow, zoom"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["u5RmF3c3Aw", "w5W5Kqtc8E"], "start_seconds": ["60", "100"], "properties": ["engine, car, zoom", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a car zooming by?", "label": 0}, {"captions": ["children speak and play together", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yVVP8XvWJTo", "wz7N8YRy74I"], "start_seconds": ["260", "30"], "properties": ["children, speak, play", "rooster, crow, background, men"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage of the sun shining through the clouds on a cloudy day"], 
"captions_pred_audio": ["children are speaking and breathing with background noise ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity shows a rooster crow?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "water splashes as an animal walks through"], "sample_ids": ["xV7Mg1QucSc", "w1ir-sZ3Im8"], "start_seconds": ["14", "90"], "properties": ["alarm, ticktocks, laughs", "animal, water, splashes"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["w34HjHr6gAY", "xV7Mg1QucSc"], "start_seconds": ["30", "14"], "properties": ["beeps, hit, woman", "alarm, ticktocks, laughs"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "an alarm clock ticks and a woman laughs"], "question": "which entity has a man laugh?", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "waves crash against a shoreline and people speak"], "sample_ids": ["wRBHTgrbiwg", "yFB25fqfU8I"], "start_seconds": 
["50", "300"], "properties": ["birds, chirp, cooing", "wave, crash, shoreline"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a person surfing in the ocean"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["white noise and birds chirping", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wRBHTgrbiwg", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["noise, white, chirping", "airplane, boy, fly"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a clock ticktocks in wind"], "sample_ids": ["uiS58TNyUiw", "yVumC9TGknc"], "start_seconds": ["430", "30"], "properties": ["audio, man, speaking", "ticktocks, clock, wind"], "captions_pred_video": ["of the pigeon in the cage", "game title screen of the game shadow of the colossus on sony playstation 2"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a series of beeps and chirps"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as a machine runs", "water runs into a sink while men speak"], "sample_ids": ["vD6lYD1l0BY", "vzceMbklWc"], "start_seconds": ["330", "180"], "properties": ["a, machine, run", "water, sink, run"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "water is running and a man is speaking"], "question": "which 
entity is a man speaking as a machine runs?", "label": 0}, {"captions": ["a clock alarm sounds and gears turn", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w2M4i1mklOA", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["alarm, gears, turn", "airplane, boy, fly"], "captions_pred_video": ["footage of an antique clock", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["ukxt9I7eMMg", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["food, pan, cook", "rustling, ducks, quack"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["ylpYOorfH4o", "yeFvk9x0wWI"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "clack, bird, chirp"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube 
how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a man is speaking and an engine is revving", "birds chirp in the background as a car drives by "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a door opens and closes"], "sample_ids": ["yajyRTUQk3U", "vBHyYJ8pL0"], "start_seconds": ["400", "2"], "properties": ["noise, woman, speak", "open, close, door"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is more silent", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "someone is typing on a computer keyboard"], "sample_ids": ["vuUVPzd2FXw", "v0x1odnXtP0"], "start_seconds": ["160", "210"], "properties": ["a, steam, release", "keyboard, type, computer"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a woman sneezes then speaks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["x4dZyf9Gbj0", "xfaoyyzw2WU"], "start_seconds": ["130", "180"], "properties": ["sneezes, speaks, woman", "loud, jet engine, roar"], "captions_pred_video": ["footage is blurry and out of focus", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman sneezes and speaks", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, 
{"captions": ["a man speaks over a radio as wind blows and water splashes", "a duck quacks continuously"], "sample_ids": ["tDVADusiIoc", "vh30P49Po6s"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "quacks, continuously, duck"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a duck is quacking loudly"], "question": "which entity is speaking", "label": 0}, {"captions": ["multiple birds vocalize and wind blows", "an infant crying as a woman laughs"], "sample_ids": ["uoGVs9yUqY4", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["multiple, vocalize, wind", "a, laugh, infant"], "captions_pred_video": ["for how to make a wooden shed door youtube", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["yks4cLgIDMc", "wRBHTgrbiwg"], "start_seconds": ["170", "50"], "properties": ["background, speaking, child", "bird, owl, speak"], "captions_pred_video": ["footage of two kids wrestling on the floor", "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a man is speaking and a child is crying", "birds are chirping and insects are buzzing"], "question": "which entity has a bird speaking?", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "a woman speaks as she rubs two objects together"], "sample_ids": ["wqADXCzngMw", "vzxHnu-SFEw"], "start_seconds": ["340", "80"], "properties": ["audio, humming, revving", "two objects, woman, speak"], "captions_pred_video": 
["of a man working on a vintage volkswagen beetle", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xjhAnI2q6hM", "ukg5L09Wpvo"], "start_seconds": ["6", "150"], "properties": ["engine revs, vehicle, people", "clickety-clack, train, whistle"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "birds chirp and objects are moved around"], "sample_ids": ["sapQIQUhFc", "yPUYU6t3rwo"], "start_seconds": 
["280", "370"], "properties": ["liquid, flow, distance", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaking with light rustling", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zOZleIRqZm4", "tdWhHV3X25Q"], "start_seconds": ["80", "60"], "properties": ["light, rustling, man", "applause, audience, yells"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaking with light rustling", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["zOZleIRqZm4", "y2bVZ7rz-5M"], "start_seconds": ["80", "280"], "properties": ["light, rustling, man", "motor noise, horn, siren"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a truck is honking its horn and a siren is blaring "], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "a vehicle engine accelerates and wind blows"], "sample_ids": ["vs65y4qmyBE", "wudZTNBtVqc"], "start_seconds": ["340", "60"], "properties": ["wind, blows, strongly", "accelerates, engine, wind"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage is of a parking lot with cars parked in it"], 
"captions_pred_audio": ["a heavy engine is running and men are speaking ", "a car accelerates and revs its engine "], "question": "which entity is about a vehicle engine accelerating and wind blowing?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a piece of wood is being placed down and sawed"], "sample_ids": ["tiDFTC-5vU", "uiItxDsDMFI"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "wood, piece, saw"], "captions_pred_video": [null, "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vZAw4apG0Es", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["people, clock, converse", "engine, laugh, loud"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a clock is ticking and people are talking", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "people cheer as a vehicle engine revs"], "sample_ids": ["w34HjHr6gAY", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["beeps, hit, woman", "engine revs, vehicle, people"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of 
oz 1995 the wizard of oz 1995 the wizard of oz", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x6ijhqRY38s", "xKB8O8LTs6s"], "start_seconds": ["250", "70"], "properties": ["something metal, glass, hit", "music, gunfire, explosion"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "someone snores nearby"], "sample_ids": ["se87d6yxEOA", "spJCm8tD9Zo"], "start_seconds": ["10", "90"], "properties": ["run, whistle, pass", "someone snores, nearby, someone"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a person is snoring loudly"], "question": "which is quieter", "label": 0}, {"captions": ["a person is burping while a girl speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vdoxuJn9lTc", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["person, burp, girl", "applause, audience, yells"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a man is talking to another man on a stage in front of a microphone"], 
"captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wyllXV6PjKo", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a baby, a woman, a man", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yZrFNS7GFBQ", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["pigeon, buzzes, insect", "three men, wind, flow"], "captions_pred_video": ["of the bird in the cage", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a pigeon?", "label": 0}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "people speak as gunfire rings out"], "sample_ids": ["wvKpEYswXO0", "wqTCwqVRDlk"], "start_seconds": ["150", "80"], "properties": ["water, tap, run", "gunfire, ring, speak"], "captions_pred_video": ["of the person preparing food in the kitchen", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "people speak in the background 
as a clock ticktocks"], "sample_ids": ["sU53zg9Jp7s", "vZAw4apG0Es"], "start_seconds": ["380", "30"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "background, clock, ticktocks"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a clock is ticking and people are talking"], "question": "which entity has a clock ticktocking?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "men speak and a nozzle sprays liquid"], "sample_ids": ["yaln9y8I7ms", "wRV8yMk886E"], "start_seconds": ["230", "0"], "properties": ["female, flushes, toilet", "liquid, spray, nozzle"], "captions_pred_video": ["footage is blurry and out of focus", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man speaks followed by a loud burst"], "question": "which entity is a machine", "label": 1}, {"captions": ["wind blows strongly", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["w8uLijTqtlU", "tiDFTC-5vU"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "male, duck, laugh"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and ducks are quacking"], "question": "which entity is more likely to be a natural phenomenon", "label": 0}, {"captions": ["children cheer as a man speaks then an audience screams", "people applaud and hoot and chat quietly"], "sample_ids": ["vJvryTwuAV8", "wwyfGO2J4"], "start_seconds": ["16", "90"], "properties": ["audience, cheer, man", "people, applaud, hoot"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", null], "captions_pred_audio": ["a man is speaking and a crowd is shouting 
and whooping ", "people are clapping and speaking with background noise "], "question": "which entity shows a quieter audience", "label": 1}, {"captions": ["an engine runs loudly", "a toilet flushes and a female speaks"], "sample_ids": ["vqZuVbG6-HI", "yaln9y8I7ms"], "start_seconds": ["130", "230"], "properties": ["loud, engine, run", "female, flushes, toilet"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage is blurry and out of focus"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a toilet flushes and a man speaks"], "question": "which entity is silent", "label": 1}, {"captions": ["a person is snoring while sleeping", "dogs barking and whimpering"], "sample_ids": ["vJrjSeP17yE", "tIY7qOV3rEM"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "barking, whimpering, dog"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a person snoring loudly", "a dog is barking and a cat is meowing"], "question": "which entity is a dog", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "several insects fly while two men talk"], "sample_ids": ["wTjoRj1se3U", "s-T9OVOiMLo"], "start_seconds": ["390", "330"], "properties": ["engine, run, people", "several, fly, men"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a nature setting", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "water is sprayed across a hard surface"], "sample_ids": ["xKB8O8LTs6s", "sQwlkXjQabo"], "start_seconds": ["70", "10"], 
"properties": ["music, gunshots, explosion", "water, spray, surface"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a infant makes noise and is excited", "bees buzz and wind blows"], "sample_ids": ["wIJK3-5y0kA", "tMJne1a4AFI"], "start_seconds": ["30", "0"], "properties": ["noise, excited, infant", "bees buzz, wind blows, bees"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a swarm of bees on the ground"], "captions_pred_audio": ["a baby cries and a woman speaks", "a swarm of bees buzzing around"], "question": "which entity is buzzing", "label": 1}, {"captions": ["a man speaking with light rustling", "ticking continues without interruption"], "sample_ids": ["zOZleIRqZm4", "v-g-j2uTByM"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "ticking, continuous, clock"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a clock is ticking loudly"], "question": "which entity is continuous", "label": 1}, {"captions": ["children speak and play together", "vehicles pass by on a roadway"], "sample_ids": ["yVVP8XvWJTo", "tgbONvsP47Y"], "start_seconds": ["260", "0"], "properties": ["children, speak, play", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage of a fire truck entering a garage"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a fly buzzes around loudly as birds 
chirp", "a car accelerates and wind blows"], "sample_ids": ["uJV8NDaHqqk", "u0TrcHhkPQ"], "start_seconds": ["100", "20"], "properties": ["loud, fly, chirp", "accelerates, wind, blows"], "captions_pred_video": ["a bee hive in a wooden box", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "waves crash against a shoreline and people speak"], "sample_ids": ["rwTERCUno", "yFB25fqfU8I"], "start_seconds": ["90", "300"], "properties": ["engine, idle, sputter", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a jet engine spools up and takes off", "some men converse over an engine running"], "sample_ids": ["vBslzh7saPw", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["engine, spools, takes", "men, converse, engine"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "people cheer as a vehicle engine revs"], "sample_ids": ["vJ7JPEFhyLA", "xjhAnI2q6hM"], "start_seconds": ["16", "6"], "properties": ["three men, wind, flow", "engine revs, vehicle, people"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a truck is revving its engine and a man is 
speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an infant crying frantically", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zwOBqeFTgiU", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["cry, infant, frantically", "water, radio, man"], "captions_pred_video": ["of the baby crying in the car seat", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a train horn sounds as it passes by", "people cheer as a vehicle engine revs"], "sample_ids": ["ukg5L09Wpvo", "xjhAnI2q6hM"], "start_seconds": ["150", "6"], "properties": ["sound, train, horn", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["w8uLijTqtlU", "w5W5Kqtc8E"], "start_seconds": ["70", "100"], "properties": ["wind, microphone, noise", "wind, blow, vehicle"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a recording of wind noise?", "label": 0}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["w-4gHptFNuU", "sLUnaPT5gM8"], "start_seconds": ["21", "0"], "properties": ["engine revs, accelerates, bump", "loud, 
laughter, intermittent"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["engines sputter roughly and tires squeal", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zhx6hoYrHeI", "sLUnaPT5gM8"], "start_seconds": ["160", "0"], "properties": ["engine, sputter, rough", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a scream", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vlJS7LN2XyM", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["background, clocks, ticking", "applause, audience, yells"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "someone whistles a song"], "sample_ids": ["tiDFTC-5vU", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["male, duck, laugh", "someone, song, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a person whistling 
a song"], "question": "which entity is a person", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a woman speaks as she rubs two objects together"], "sample_ids": ["sZPuqDgX2V0", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["commentator, race, track", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "multiple people speak and children yell while water gurgles"], "sample_ids": ["slZLHwNbbt4", "vb1fPSDI4c"], "start_seconds": ["300", "30"], "properties": ["clap, distance, horn", "multiple, people, yell"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a crowd of people are 
talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks and is typing on a keyboard", "a vehicle engine accelerating then running on idle"], "sample_ids": ["x9JovgqUcs", "vYkA3cfXp5Q"], "start_seconds": ["500", "30"], "properties": ["a, man, speaks, keyboard", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man speaks and types on a keyboard", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people speak softly as food sizzles", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yhQ2Lg-7qDY", "zFjIWfSD-4"], "start_seconds": ["130", "410"], "properties": ["food, sizzle, speak", "People, motor, brakes"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a motor?", "label": 1}, {"captions": ["a small engine spits as it runs", "multiple motorcycles pass by as a man speaks"], "sample_ids": ["sZvwOuuPGP0", "zcDwZ6W7E3E"], "start_seconds": ["50", "180"], "properties": ["spits, engine, runs", "man, speak, motorcycles"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity is a video of multiple motorcycles passing by as a man speaks?", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["u--KhUW8l1Y", "su6FAOcOA8c"], "start_seconds": ["0", "4"], 
"properties": ["horn, siren, life", "engine, idle, woman"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "an infant crying as a woman laughs"], "sample_ids": ["zofjfKhqLk8", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["background, metal, clings", "a, laugh, infant"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "small dogs yip and bark sharply"], "sample_ids": ["xfudFO976zE", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["animal, bleats, cry", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry and shaky", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a dog barks and growls"], "question": "which animal is more aggressive", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["v0x1odnXtP0", "yks4cLgIDMc"], "start_seconds": ["210", "170"], "properties": ["keyboard, type, computer", "background, speaking, child"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking and a child is crying"], "question": "which entity has a child 
shouting in the background", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "birds chirp and objects are moved around"], "sample_ids": ["vbZ-0lGPneg", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["a woman, a television program, a bird", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "insects buzz and a man speaks"], "question": "which entity has more birds", "label": 1}, {"captions": ["leaves rustle while man speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zOZleIRqZm4", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["leaves, rustle, speak", "three men, wind, flow"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "an infant crying frantically"], "sample_ids": ["vf44CgrjT0A", "zwOBqeFTgiU"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "cry, infant, frantically"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "of the baby crying in the car seat"], "captions_pred_audio": ["a loud burp", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a child speaks in closed space"], "sample_ids": ["sSMl2vc3ek", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["a person, laughs, snores", "child, space, 
speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 0}, {"captions": ["several beeps are followed by a hit and a woman talking", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["w34HjHr6gAY", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["beeps, hit, woman", "engine, revs, vehicle"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "an engine runs loudly"], "sample_ids": ["sLUnaPT5gM8", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["loud, laughter, intermittent", "loud, engine, run"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as crickets sing", "a chime of a clock followed by various tones of ticking with come 
clinking"], "sample_ids": ["ryFDPxgDOGc", "uqFtmnhuqA8"], "start_seconds": ["570", "30"], "properties": ["a, crickets, sing", "a, b, c"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a toilet flushes and a female speaks"], "sample_ids": ["smDKStoHBJo", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["a, infant, speaking", "female, flushes, toilet"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a toilet flushes and a man speaks"], "question": "which entity is about a toilet?", "label": 1}, {"captions": ["a man speaks as a machine runs", "an small aircraft engine runs and a boy speaks"], "sample_ids": ["vD6lYD1l0BY", "xSKJGCItUWE"], "start_seconds": ["330", "10"], "properties": ["a, machine, run", "engine, run, boy"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a high pitched engine is running and a child speaks"], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["people speak in a closed space", "water is sprayed across a hard surface"], "sample_ids": ["sTpirNYo8vQ", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["people, space, speak", "water, spray, surface"], "captions_pred_video": ["of a man taking a selfie on a bus", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking while 
a car is revving and accelerating ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "two men speak as a buffeting wind blows"], "sample_ids": ["wwyfGO2J4", "y8WEcpOlT3I"], "start_seconds": ["90", "40"], "properties": ["people, applaud, hoot", "wind, speak, buffeting"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a toilet flushes and a female speaks"], "sample_ids": ["sjlVMgdGSK0", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["accelerates, vehicle, race car", "female, flushes, toilet"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage is blurry and out of focus"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a door slams shut roughly", "people applaud and hoot and chat quietly"], "sample_ids": ["zkKdxzNC97Y", "wwyfGO2J4"], "start_seconds": ["27", "90"], "properties": ["a door, slams, shut", "people, applaud, hoot"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["zFjIWfSD-4", "yks4cLgIDMc"], "start_seconds": ["410", "170"], "properties": ["People, motor, brakes", "background, speaking, child"], "captions_pred_video": 
[null, "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a man speaks uses a drill"], "sample_ids": ["zcDwZ6W7E3E", "x5eIC7S0fbg"], "start_seconds": ["180", "60"], "properties": ["man, speak, motorcycles", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking and using a power tool "], "question": "which man is speaking", "label": 1}, {"captions": ["a jet engine spools up and takes off", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vBslzh7saPw", "tdWhHV3X25Q"], "start_seconds": ["90", "60"], "properties": ["engine, spools, takes", "applause, audience, yells"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u--KhUW8l1Y", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["horn, siren, life", "airplane, boy, fly"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a fire 
truck siren blares and a horn blows ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a frog croaks as other frogs croak in the background"], "sample_ids": ["spJCm8tD9Zo", "yswmmRZFItk"], "start_seconds": ["90", "0"], "properties": ["snores, wheezes, sleeps", "background, frog, croak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a close up of a frog in the water"], "captions_pred_audio": ["a person is snoring loudly", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yZrFNS7GFBQ", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["pigeon, buzzes, insect", "men, talk, cars"], "captions_pred_video": ["of the bird in the cage", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a woman speaks happily and an animal chirps"], "sample_ids": ["wfHeoPDLMaM", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["quacking, squawking, ducks", "a woman, chirps, animal"], "captions_pred_video": ["ducks in a 
barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a woman is speaking and a dog is barking "], "question": "which entity is a bird?", "label": 0}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wvKpEYswXO0", "vfYTJq7nU"], "start_seconds": ["150", "130"], "properties": ["water, tap, run", "rustling, ducks, quack"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["a person sniffles and sneezes", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uRlbY6aoBU", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is sneezing ", "a gun is fired followed 
by splashing and a person sneezing"], "question": "which entity is not followed by water sloshing nearby?", "label": 0}, {"captions": ["a toilet flushes and water drains", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sfAvvZwdLCY", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["water drains, flushes, water", "applause, audience, yells"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y1saVTXsKwc", "xKB8O8LTs6s"], "start_seconds": ["80", "70"], "properties": ["a, dog, talk", "music, gunfire, explosion"], "captions_pred_video": ["a dog playing with a pink ball", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog barks and a man speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["zl9Dqx-j7q4", "vlS6YMeWAPo"], "start_seconds": ["6", "40"], "properties": ["engine, laugh, loud", "sheep, baa, birds"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a jet engine roars ", "a goat bleats and birds chirp"], "question": "which entity is followed by a man laughing", "label": 0}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a man speaks followed by another man speaking outside"], "sample_ids": ["yZrFNS7GFBQ", "viuTg1M-dqg"], "start_seconds": ["30", "30"], 
"properties": ["pigeon, buzzes, insect", "two men, speak, follow"], "captions_pred_video": ["of the bird in the cage", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single person speaking?", "label": 0}, {"captions": ["a motorcycle engine is idling", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vZAqdHZ81yA", "vYkA3cfXp5Q"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "engine, accelerate, idle"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["an engine is idling loudly", "an engine is idling"], "question": "which entity has an engine that is idling", "label": 0}, {"captions": ["bees buzz and wind blows", "frogs croak and vocalize"], "sample_ids": ["tMJne1a4AFI", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["bees buzz, wind blows, bees", "croak, vocalize, frog"], "captions_pred_video": ["a swarm of bees on the ground", "a close up of a frog in the water"], "captions_pred_audio": ["a swarm of bees buzzing around", "a frog is croaking"], "question": "which animal is more likely to be a frog", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "birds twitter and chirp and clatter"], "sample_ids": ["vJvryTwuAV8", "yeFvk9x0wWI"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "chirp, twitter, clatter"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "birds chirp in the background as a car drives by "], "question": "which 
entity is more quiet", "label": 1}, {"captions": ["some people speak", "a car speeding up in the distance"], "sample_ids": ["vbZ-0lGPneg", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "distance, car, speed"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a man speaks as a motor runs in the background"], "sample_ids": ["uWPRNLnpy7Y", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "background, motor, run"], "captions_pred_video": ["is taken from a car driving down the street", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a person speaks over rustling leaves", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zOZleIRqZm4", "w5W5Kqtc8E"], "start_seconds": ["80", "100"], "properties": ["rustling, leaves, person", "wind, blow, vehicle"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "an airplane flies overhead as a woman speaks"], "sample_ids": ["yPUYU6t3rwo", "zj2R0XoFr5k"], "start_seconds": ["370", "50"], "properties": ["birds chirp, objects are moved around, birds", "airplane, fly, 
overhead"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["insects buzz and a man speaks", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving around", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "waves crash against a shoreline and people speak"], "sample_ids": ["sa6TLVbooCc", "yFB25fqfU8I"], "start_seconds": ["240", "300"], "properties": ["people, laugh, child", "wave, crash, shoreline"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "birds chirp and objects are moved around"], "sample_ids": ["vzxHnu-SFEw", "yPUYU6t3rwo"], "start_seconds": ["80", "370"], "properties": ["two objects, woman, speak", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["water flows and trickles", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tB7hWb9gTuQ", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["water, flow, trickle", "loud, multiple, distance"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", null], "captions_pred_audio": ["water is splashing and gurgling", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "several insects fly while two men talk"], "sample_ids": ["vmrxwuAMb2I", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["a dog, inhales, exhales", "several, fly, men"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a still image?", "label": 0}, {"captions": ["someone whistles a tune", "a stream of water runs briefly"], "sample_ids": ["sIXTftIuUgw", "x-PeY8Yb8M4"], "start_seconds": ["90", "300"], "properties": ["someone, tune, whistle", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person whistling a song", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["someone is burping continuously", "a male is 
speaking and a duck quacks as others laugh"], "sample_ids": ["y636gklDioE", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "male, duck, laugh"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "a man is speaking and ducks are quacking"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["smDKStoHBJo", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["a, talk, baby, cry", "rooster, crow, background, men"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "wind blowing followed by a zoom"], "sample_ids": ["slZLHwNbbt4", "vr8ZXjEBhMQ"], "start_seconds": ["300", "150"], "properties": ["a, horn, run", "wind, blow, zoom"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom of", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sa6TLVbooCc", "y8WEcpOlT3I"], "start_seconds": ["240", "40"], "properties": ["people, laugh, child", "harsh, wind, blows"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "on how to use a sewing machine youtube"], 
"captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "water pouring and bubbling"], "sample_ids": ["wRBHTgrbiwg", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["bird, owl, speak", "water, bubbles, pouring"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uWPRNLnpy7Y", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["accelerate, laugh, vehicle", "female, spraying, scream"], "captions_pred_video": ["is taken from a car driving down the street", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a person 
sniffles and then sneezes in the distance", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uRlbY6aoBU", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["a, distance, sneeze", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 0}, {"captions": ["someone snores nearby", "people speak as gunfire rings out"], "sample_ids": ["spJCm8tD9Zo", "wqTCwqVRDlk"], "start_seconds": ["90", "80"], "properties": ["someone snores, nearby, someone", "gunfire, ring, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["motors runs briefly and tires screech", "water runs into a sink while men speak"], "sample_ids": ["yRx9txMcBl0", "vzceMbklWc"], "start_seconds": ["40", "180"], "properties": ["motors, tires, screech", "water, sink, run"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "water is running and a man is speaking"], "question": "which entity is a video of a 
sink?", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y682ml90jGw", "sSMl2vc3ek"], "start_seconds": ["11", "20"], "properties": ["beeps, series, electronic", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wAAkbZToh8", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["burp, laugh, speak", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man burps and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a person speaking and laughing?", "label": 0}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["y2bVZ7rz-5M", "zl9Dqx-j7q4"], "start_seconds": ["280", "6"], "properties": ["engine, horn, siren", "engine, laugh, loud"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a jet engine roars "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["motors rev and run loudly as a person laughs", "water is sprayed across a hard surface"], "sample_ids": ["zl9Dqx-j7q4", "sQwlkXjQabo"], "start_seconds": ["6", "10"], "properties": ["motors rev, laugh, loudly", "water, spray, surface"], "captions_pred_video": ["footage of a man driving a car in the dark", "a close-up of a red car with water droplets 
on the hood"], "captions_pred_audio": ["a jet engine roars ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "an insect buzzes around continuously"], "sample_ids": ["zj2R0XoFr5k", "v25l1jef3JY"], "start_seconds": ["50", "0"], "properties": ["airplane, fly, overhead", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a fly is buzzing around a microphone "], "question": "which entity is flying", "label": 0}, {"captions": ["a baby cries and a woman speaks", "some men converse over an engine running"], "sample_ids": ["tMbMDvT50j8", "sCiy7QS1U"], "start_seconds": ["12", "300"], "properties": ["a, cry, woman", "men, converse, engine"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a baby?", "label": 0}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["wy1eKjR7KC0", "w6RTHR6AeAg"], "start_seconds": ["30", "40"], "properties": ["people, talk, distance", "call, owl, screech"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "an owl hoots and mechanisms operate "], "question": "which entity is a bird?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["uJV8NDaHqqk", "uZesmtKZGSw"], 
"start_seconds": ["100", "250"], "properties": ["loud, fly, chirp", "men, talk, cars"], "captions_pred_video": ["a bee hive in a wooden box", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet flushes and water drains", "an infant crying as a woman laughs"], "sample_ids": ["sfAvvZwdLCY", "xhmRY9yhC7c"], "start_seconds": ["20", "20"], "properties": ["water drains, flushes, water", "a, laugh, infant"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a toilet is flushed", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a heavy rain falls endlessly", "someone whistles a tune"], "sample_ids": ["wP8ZKrlx3oA", "sIXTftIuUgw"], "start_seconds": ["40", "90"], "properties": ["heavy, rain, fall", "someone, tune, whistle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["a person sneezes followed by another person speaking", "pigeons vocalize and birds chirp"], "sample_ids": ["t8CV69hcvF0", "uiS58TNyUiw"], 
"start_seconds": ["210", "430"], "properties": ["person, sneeze, follow", "vocalize, bird, chirp"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "of the pigeon in the cage"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "multiple people speak and children yell while water gurgles"], "sample_ids": ["t25U-v4k4ts", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["bees buzz, birds chirp, man speaks", "multiple, people, yell"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man is filing a hard object", "some men converse over an engine running"], "sample_ids": ["vveS8HT7Uog", "sCiy7QS1U"], "start_seconds": ["100", "300"], "properties": ["a man, hard, object", "men, converse, engine"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which object is harder to file", "label": 0}, {"captions": ["water splashes and wind noise is made into a microphone", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sDSppXIlJrs", "w5W5Kqtc8E"], "start_seconds": ["27", "100"], "properties": ["microphone, water, wind", "wind, blow, vehicle"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", null], "captions_pred_audio": ["the wind is blowing and water is splashing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a 
vehicle engine running?", "label": 1}, {"captions": ["a man speaks as horns blow", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["tHyNqRyK34A", "s7knHCFW82w"], "start_seconds": ["24", "30"], "properties": ["a, man, speaks", "blow horn, get close, train"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is about a train blowing its horn?", "label": 1}, {"captions": ["a car accelerates and wind blows", "people applaud and hoot and chat quietly"], "sample_ids": ["u0TrcHhkPQ", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["accelerates, wind, blows", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be at a concert", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a infant makes noise and is excited"], "sample_ids": ["zY3icUyMdh8", "wIJK3-5y0kA"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "noise, excited, infant"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "water splashes as an animal walks through"], "sample_ids": ["xzKKf9bKNUo", "w1ir-sZ3Im8"], "start_seconds": ["10", "90"], "properties": ["background, 
noise, snoring", "animal, water, splashes"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a person snoring loudly", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zOZleIRqZm4", "xBxDz0CFVn0"], "start_seconds": ["80", "30"], "properties": ["rustling, leaves, person", "stream, water, flow"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zofjfKhqLk8", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], "properties": ["noise, stop, motor", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and a dog is whimpering"], "question": "which entity is a television program?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["v0x1odnXtP0", "tDVADusiIoc"], "start_seconds": ["210", "60"], "properties": ["keyboard, type, computer", "water, radio, man"], "captions_pred_video": ["how to make money on youtube in spanish", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person is typing on a keyboard", 
"a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person?", "label": 0}, {"captions": ["some men converse over an engine running", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sCiy7QS1U", "sSMl2vc3ek"], "start_seconds": ["300", "20"], "properties": ["men, converse, engine", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a person snoring loudly"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zdYdyF9-m8U", "xKB8O8LTs6s"], "start_seconds": ["7", "70"], "properties": ["wind, crash, shoreline", "music, gunfire, explosion"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["waves crash and wind blows ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["people speak as gunfire rings out", "a stream of water runs briefly"], "sample_ids": ["wqTCwqVRDlk", "x-PeY8Yb8M4"], "start_seconds": ["80", "300"], "properties": ["gunfire, ring, speak", "stream, water, run"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "paper is crumpling consistently"], "sample_ids": ["yYJksgsxx5U", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["audio, woman, silverware", "paper is crumpling, paper is 
white, paper is crumpling"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "people speak as gunfire rings out"], "sample_ids": ["uPDn2BFTHk", "wqTCwqVRDlk"], "start_seconds": ["140", "80"], "properties": ["lady, laugh, baby", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["yDoT73BWsdA", "siJFXfGWgDk"], "start_seconds": ["10", "50"], "properties": ["engine revs, tires squeal, vehicle", "man, woman, vehicle"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and birds are chirping in the background "], "question": "which entity is a video of a vehicle?", "label": 0}, {"captions": ["water pouring and bubbling", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uyRfq-jKPpo", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["water, bubbles, pouring", "a woman, a television program, a bird"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do 
an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["water is running from a faucet", "a woman is speaking and a dog is whimpering"], "question": "which entity is a video of a television program?", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "water pouring and bubbling"], "sample_ids": ["w9lpbUn0hPc", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["male, wind, rustling", "water, bubbles, pouring"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "water is running from a faucet"], "question": 
"which entity is a liquid", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "some men converse over an engine running"], "sample_ids": ["uYT5gxnyMWM", "sCiy7QS1U"], "start_seconds": ["50", "300"], "properties": ["a, scream, girl", "men, converse, engine"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a girl speaking followed by a scream and more girls talking?", "label": 0}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zgUgkpk78xU", "tDVADusiIoc"], "start_seconds": ["70", "60"], "properties": ["horn, bell, train", "water, radio, man"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wIJK3-5y0kA", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["a, cry, baby", "airplane, boy, fly"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby cries and a woman speaks", "a 
woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["se87d6yxEOA", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["run, whistle, pass", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a train blows its whistle and blows its horn "], "question": "which train whistle is continuous", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zVacuqSb4LI", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["blares, fades, train", "airplane, boy, fly"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train 
whistle blows and a train passes by with a whistle blowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sapQIQUhFc", "vb1fPSDI4c"], "start_seconds": ["280", "30"], "properties": ["water, stream, trickles", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a toilet flushes and a female speaks"], "sample_ids": ["vddP56-ogds", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["liquid, laughs, man", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uiS58TNyUiw", "sSMl2vc3ek"], "start_seconds": ["430", "20"], "properties": ["audio, man, speaking", "loud, multiple, distance"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["y2bVZ7rz-5M", "w34HjHr6gAY"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "beeps, hit, woman"], "captions_pred_video": ["footage of a 
parade of fire trucks driving down the street", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "an infant crying frantically"], "sample_ids": ["uYT5gxnyMWM", "zwOBqeFTgiU"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "cry, infant, frantically"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["someone snores nearby", "three men talk while wind blows and some liquid flows"], "sample_ids": ["spJCm8tD9Zo", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["someone snores, nearby, someone", "three men, wind, flow"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": 
["u--KhUW8l1Y", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "a woman, a television program, a bird"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a still image?", "label": 0}, {"captions": ["a door opens and birds chirp", "pigeons vocalize and birds chirp"], "sample_ids": ["yeFvk9x0wWI", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["door, open, birds", "vocalize, bird, chirp"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of the pigeon in the cage"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an airplane accelerates briefly", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["zjTG0gaGCUI", "vddP56-ogds"], "start_seconds": ["80", "30"], "properties": ["accelerates, airplane, briefly", "liquid, laughs, man"], "captions_pred_video": [null, null], "captions_pred_audio": ["a jet engine roars as wind blows ", "water is running and gurgling and a man is speaking"], "question": "which entity is a video of a man talking?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["xZepNM9qcRA", "rqu8iB22IY"], "start_seconds": ["30", "5"], "properties": ["background, motor, run", "sound, repeats, laugh"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a dog barks and a man speaks while music plays "], "question": "which entity has a motor running in the background?", 
"label": 0}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a toilet flushes and a female speaks"], "sample_ids": ["xKB8O8LTs6s", "yaln9y8I7ms"], "start_seconds": ["70", "230"], "properties": ["music, gunfire, explosion", "female, flushes, toilet"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage is blurry and out of focus"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a toilet flushes and a man speaks"], "question": "which entity is more likely to be in a movie", "label": 0}, {"captions": ["a girl speaks followed by a scream and more girls talking", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uYT5gxnyMWM", "w34HjHr6gAY"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "beeps, hit, woman"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xhmRY9yhC7c", "wqZ135Ssz0"], "start_seconds": ["20", "60"], "properties": ["a, laugh, infant", "two men, woman, birds"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries 
and a woman speaks", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a vehicle engine runs as a siren and horn sound", "water pouring and bubbling"], "sample_ids": ["u--KhUW8l1Y", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["sound, vehicle, horn", "water, bubbles, pouring"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["wqUmIEzuNz4", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["frog, bird, vocalize", "animal, grunts, chirps"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a cat meows and rustles", "birds are chirping and a rooster is crowing "], "question": "which entity is a frog", "label": 0}, {"captions": ["a cat meows and children speak", "people speak as gunfire rings out"], "sample_ids": 
["x5cuQjOdM3E", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["cat, speak, children", "gunfire, ring, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a toilet door squeaks as it is opened", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sdXV-ylviw", "zj2R0XoFr5k"], "start_seconds": ["190", "50"], "properties": ["door, toilet, squeaks", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a moving object", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "birds chirp and objects are moved around"], "sample_ids": ["wqZ135Ssz0", "yPUYU6t3rwo"], "start_seconds": ["60", "370"], "properties": ["man, woman, squawks", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["s59PfAghdkM", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["bird, chirp, background, horse, neigh", "loud, laughter, intermittent"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence 
in the background", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds vocalize and a man speaks", "wind blowing followed by a zoom"], "sample_ids": ["v0wPrLBI3hg", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["vocalize, bird, speak", "wind, blow, zoom"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a train horn sounds as it passes by", "winds blows roughly as a vehicle races past"], "sample_ids": ["ukg5L09Wpvo", "xjvTpk2Zpr8"], "start_seconds": ["150", "70"], "properties": ["sound, train, horn", "wind, blows, vehicle"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a drill drills through something then people begin laughing", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tEE3MpBt1sg", "vYkA3cfXp5Q"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "engine, accelerate, idle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "an engine is idling"], "question": "which is a vehicle", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a 
child speaks in closed space"], "sample_ids": ["sNB8zxXneIM", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["several, quack, cocks", "child, space, speak"], "captions_pred_video": ["a group of geese in a cage", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "pigeons vocalize and birds chirp"], "sample_ids": ["tIY7qOV3rEM", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "vocalize, bird, chirp"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of the pigeon in the cage"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking and a bee is buzzing"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tQWGZLItBXk", "tiDFTC-5vU"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "male, duck, laugh"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "wind blowing followed by a zoom"], "sample_ids": ["xjvTpk2Zpr8", "vr8ZXjEBhMQ"], "start_seconds": ["70", "150"], "properties": ["wind, blows, vehicle", "wind, blow, zoom"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "is taken from a motorcycle's point of view"], 
"captions_pred_audio": ["a jet engine roars and wind blows ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tgbONvsP47Y", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["noise, truck, accelerate", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a fire truck entering a garage", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tiDFTC-5vU", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a woman is speaking and a dog is whimpering"], "question": "which entity has a duck quacking?", "label": 0}, {"captions": ["an aircraft engine runs as wind blows heavily", "an engine runs loudly"], "sample_ids": ["xjvTpk2Zpr8", "vqZuVbG6-HI"], "start_seconds": ["70", "130"], "properties": ["engine, run, wind", "loud, engine, run"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a lawn mower is running and men are speaking "], "question": "which entity is running", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a car speeding up in the distance"], "sample_ids": ["xMXvkIcaG0Y", "u0TrcHhkPQ"], "start_seconds": ["30", 
"20"], "properties": ["sound, humming, rattling", "distance, car, speed"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", null], "captions_pred_audio": ["an engine is revving and accelerating ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "an infant crying as a woman laughs"], "sample_ids": ["sZPuqDgX2V0", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["commentator, race, track", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vbr9mHKc8WM", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["an engine is idling", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is quieter", "label": 0}, {"captions": ["an engine runs and a man speaks", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["yT5WfYMRr-U", "vddP56-ogds"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "liquid, laughs, man"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "water is running and gurgling and a man is speaking"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["speaking following by laughing and clapping", 
"a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u2f5NpsoHBg", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["person, laugh, clap", "airplane, boy, fly"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a car speeds away loudly followed by a car revving loudly and driving away while outside"], "sample_ids": ["x5cuQjOdM3E", "sjlVMgdGSK0"], "start_seconds": ["30", "30"], "properties": ["cat, talk, meow", "car, revving, loudly"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a cat meows and a woman speaks", "a car accelerates and revs its engine "], "question": "which entity is louder", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a speedboat passes quickly on the water"], "sample_ids": ["zkKdxzNC97Y", "tjmoSi330GM"], "start_seconds": ["27", "23"], "properties": ["hard, surface, door", "speed, water, boat"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a door is opened and closed", "a motorboat speeds through water with wind noise "], "question": "which object is moving on a hard surface", "label": 0}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a duck quacks continuously"], "sample_ids": ["zALy31PjDl0", "vh30P49Po6s"], "start_seconds": ["21", "30"], "properties": ["a man, a vehicle, 
a horn", "quacks, continuously, duck"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a man speaks as bees buzz and birds chirp"], "sample_ids": ["smDKStoHBJo", "t25U-v4k4ts"], "start_seconds": ["0", "40"], "properties": ["a, talk, baby, cry", "bees buzz, birds chirp, man speaks"], "captions_pred_video": ["a man holding a crying baby in his arms", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and bees are buzzing"], "question": "which entity has a baby?", "label": 0}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "some tunes played by whistling"], "sample_ids": ["xjhAnI2q6hM", "u6BnG6YZqJ4"], "start_seconds": ["6", "0"], "properties": ["wind, blow, loudly", "tune, play, whistling"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["scraping and female speech with distant music", "a cat meows and children speak"], "sample_ids": ["yHeVV-xeOxQ", "x5cuQjOdM3E"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "cat, speak, children"], "captions_pred_video": ["of a girl milking a goat's udder", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a cat meows and a woman speaks"], "question": 
"which entity is more likely to be a cat", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "wind blows as people chatter quietly"], "sample_ids": ["yZmhM1HcsyE", "xBxDz0CFVn0"], "start_seconds": ["4", "30"], "properties": ["engine, roar, water", "wind, chatter, people"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "footage is blurry and out of focus"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an airplane accelerates briefly", "pigeons vocalize and birds chirp"], "sample_ids": ["zjTG0gaGCUI", "uiS58TNyUiw"], "start_seconds": ["80", "430"], "properties": ["accelerates, airplane, briefly", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a infant makes noise and is excited"], "sample_ids": ["slZLHwNbbt4", "wIJK3-5y0kA"], "start_seconds": ["300", "30"], "properties": ["a, horn, run", "noise, excited, infant"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vbpKkWvfOu4", "zl9Dqx-j7q4"], "start_seconds": ["560", "6"], "properties": ["a, man, speaks", "engine, laugh, loud"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a dark barks and whimpers"], "sample_ids": ["un9VQlzgZM", "sYj4hpDUZDQ"], "start_seconds": ["5", "30"], "properties": ["females, talk, laugh", "barks, whimpers, dark"], "captions_pred_video": [null, "a brown and white dog standing in front of a wall with its mouth open"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a dog barks and a cat meows"], "question": "which entity is a dog", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a car speeding up in the distance"], "sample_ids": ["sNB8zxXneIM", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["several, quack, cocks", "distance, car, speed"], "captions_pred_video": ["a group of geese in a cage", null], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["an airplane engine spools and people speak", "a woman speaks as she rubs two objects together"], "sample_ids": ["wTjoRj1se3U", "vzxHnu-SFEw"], "start_seconds": ["390", "80"], "properties": ["airplane, engine, spool", "two objects, woman, speak"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a jet engine is running and people are talking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["a man talks while a clock does ticktock", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["spYNpeN7rPY", "tdWhHV3X25Q"], "start_seconds": ["1", "60"], "properties": ["a clock, ticktock, man", "applause, audience, yells"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "white noise and snoring with some rustling in the background"], "sample_ids": ["wztCSUxOf8", "xzKKf9bKNUo"], "start_seconds": ["130", "10"], "properties": ["a crowd, yells, applauds", "background, noise, snoring"], "captions_pred_video": [null, "shows a woman laying on a bed with her eyes closed and her mouth open"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a person snoring loudly"], "question": "which entity is more quiet", "label": 1}, {"captions": ["some tunes played by whistling", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u6BnG6YZqJ4", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["tune, play, whistling", "airplane, boy, fly"], 
"captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person whistling a song", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "some men converse over an engine running"], "sample_ids": ["uPDn2BFTHk", "sCiy7QS1U"], "start_seconds": ["140", "300"], "properties": ["lady, laugh, baby", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a baby?", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wAAkbZToh8", "y8WEcpOlT3I"], "start_seconds": ["0", "40"], "properties": ["burp, laugh, speak", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man burps and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is about a person speaking and laughing?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["uRExseg-0XI", "ugHJF0hfYkg"], "start_seconds": ["210", "10"], "properties": ["woman, man, water", "loud, intense, propeller"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a helicopter is flying overhead "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an electric 
engine works nearby followed by a child talking", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xSKJGCItUWE", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["engine, work, child", "wind, blow, vehicle"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["xhmRY9yhC7c", "rqu8iB22IY"], "start_seconds": ["20", "5"], "properties": ["a, laugh, infant", "sound, repeats, laugh"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a dog barks and a man speaks while music plays "], "question": "which entity has a woman laugh?", "label": 0}, {"captions": ["a large crowd cheers and applauds", "a man speaks as a machine runs"], "sample_ids": ["rqfQRErjfk8", "vD6lYD1l0BY"], "start_seconds": ["170", "330"], "properties": ["crowd, cheers, applauds", "a, machine, run"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "game controller being held in the hands of the person"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a man is speaking and dishes are being washed "], "question": "which entity is a machine", "label": 1}, {"captions": ["water splashes and a door squeaks", "a man speaks over intermittent keyboard taps"], "sample_ids": ["sdXV-ylviw", "tw76HGONaKg"], "start_seconds": ["190", "570"], "properties": ["sound, splash, door", "audio, man, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time 
guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a man speaks and types on a computer keyboard "], "question": "which entity has a door?", "label": 0}, {"captions": ["a guy sneezes followed by another guy speaking and whistling and then a pigeon coos", "birds chirp and animals vocalize"], "sample_ids": ["vhIcOufIwo4", "sxIvBMSavMQ"], "start_seconds": ["30", "210"], "properties": ["sneeze, speaking, pigeon", "vocalize, chirp, animal"], "captions_pred_video": ["footage of a pigeon in a cage", "beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a"], "captions_pred_audio": ["a man is speaking and pigeons are cooing", "birds are chirping and insects are buzzing"], 
"question": "which entity is about animals vocalizing?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "an airplane engine spools and people speak"], "sample_ids": ["s3cTDAj31g", "wTjoRj1se3U"], "start_seconds": ["80", "390"], "properties": ["man, talk, woman", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a jet engine is running and people are talking"], "question": "which entity is a video of a man talking and a woman shouting?", "label": 0}, {"captions": ["a male speaks over some small clicks", "a bird is chirping and tweeting a bird song"], "sample_ids": ["uXxVebHsGZ8", "wPz6QRAkEb4"], "start_seconds": ["30", "60"], "properties": ["male, clicks, speak", "chirps, tweets, song"], "captions_pred_video": [null, "a bird in a cage on top of a pole"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "birds are chirping in the background "], "question": "which entity is a bird", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "dishes cling together then a man begins to speak"], "sample_ids": ["y8WEcpOlT3I", "sQGXqGcwOTc"], "start_seconds": ["40", "3"], "properties": ["wind, speak, buffeting", "cling, speak, dishes"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about speaking?", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uEU-Hg5MTN8", "sSMl2vc3ek"], "start_seconds": ["27", "20"], "properties": ["a woman, laughs, animal", "loud, multiple, distance"], "captions_pred_video": ["of a girl wearing a pig mask and a 
boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a kid speaks followed by music playing", "an insect buzzes around continuously"], "sample_ids": ["tQWGZLItBXk", "v25l1jef3JY"], "start_seconds": ["170", "0"], "properties": ["music, kid, speak", "buzzes, continuously, insect"], "captions_pred_video": ["worms revolution screenshots", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a man speaks as a car is passing by"], "sample_ids": ["sZPuqDgX2V0", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["commentator, race, track", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a race", "label": 1}, {"captions": ["someone is snoring while sleeping", "a person snores loudly multiple times at a close distance"], "sample_ids": ["ujMt0-D-x2k", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["snore, sleep, someone", "loud, multiple, distance"], "captions_pred_video": ["of the dog playing with a toy on the floor", null], 
"captions_pred_audio": ["a person is snoring loudly", "a person snoring loudly"], "question": "which entity is snoring while sleeping", "label": 0}, {"captions": ["an airplane engine roars increasingly louder", "a clock ticktocks"], "sample_ids": ["vBslzh7saPw", "v-g-j2uTByM"], "start_seconds": ["90", "30"], "properties": ["engine, roar, louder", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "some tunes played by whistling"], "sample_ids": ["vs65y4qmyBE", "u6BnG6YZqJ4"], "start_seconds": ["340", "0"], "properties": ["engine, run, man", "tune, play, whistling"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "a car speeding up in the distance"], "sample_ids": ["vz8868znkVQ", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["audio, click, kid speaking", "distance, car, speed"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", null], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a man speaks as a motor runs in the background"], "sample_ids": ["tDlysoZiA1I", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["animal, grunts, chirps", "background, motor, run"], 
"captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "vehicles pass by on a roadway"], "sample_ids": ["wvKpEYswXO0", "tgbONvsP47Y"], "start_seconds": ["150", "0"], "properties": ["water, tap, run", "pass, vehicle, roadway"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sSMl2vc3ek", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["a person, laughs, snores", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["xfudFO976zE", "zY3icUyMdh8"], "start_seconds": ["0", "20"], "properties": ["animal, bleats, cry", "dog, bark, engine"], "captions_pred_video": ["footage is blurry and shaky", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a car is driving and dogs are barking and squealing "], "question": "which entity is more active", "label": 
1}, {"captions": ["a dog barks and whimpers", "a car speeding up in the distance"], "sample_ids": ["sShpyu2l4YQ", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["barks, whimpers, dog", "distance, car, speed"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a piece of wood is being placed down and sawed"], "sample_ids": ["tQWGZLItBXk", "uiItxDsDMFI"], "start_seconds": ["170", "30"], "properties": ["music, person, ding", "wood, piece, saw"], "captions_pred_video": ["worms revolution screenshots", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a saw is being used with background noise "], "question": "which entity is about a person speaking?", "label": 0}, {"captions": ["a small engine spits as it runs", "small dogs yip and bark sharply"], "sample_ids": ["sZvwOuuPGP0", "v-wcQf4BDY0"], "start_seconds": ["50", "120"], "properties": ["spits, engine, runs", "bark, yip, sharply"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a medium engine is running ", "a dog barks and growls"], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["a person is snoring while sleeping", "people speak as gunfire rings out"], "sample_ids": ["vJrjSeP17yE", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["a person is sleeping, snoring, person", "gunfire, ring, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of a woman shooting a gun at a target on the 
beach"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a motorcycle idles loudly as wind blows", "water splashes as an animal walks through"], "sample_ids": ["v7jJS8aAyA", "w1ir-sZ3Im8"], "start_seconds": ["10", "90"], "properties": ["wind, blows, loudly", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "water flows and trickles"], "sample_ids": ["vqZuVbG6-HI", "tB7hWb9gTuQ"], "start_seconds": ["130", "30"], "properties": ["background, male, female", "water, flow, trickle"], "captions_pred_video": ["footage is blurry because it's raining outside", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "water is splashing and gurgling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a train horn blows as it passes by"], "sample_ids": ["sSMl2vc3ek", "zVacuqSb4LI"], "start_seconds": ["20", "30"], "properties": ["a person, laughs, snores", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a person snoring loudly", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "motors rev and run loudly as a person laughs"], "sample_ids": ["vqZuVbG6-HI", "zl9Dqx-j7q4"], "start_seconds": ["130", "6"], "properties": ["background, male, female", "motors rev, laugh, loudly"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "wind blowing followed by a zoom"], "sample_ids": ["zl9Dqx-j7q4", "vr8ZXjEBhMQ"], "start_seconds": ["6", "150"], "properties": ["engine, laugh, loud", "wind, blow, zoom"], "captions_pred_video": ["footage of a man driving a car in the dark", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a jet engine roars ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a woman speaks as she rubs two objects together"], "sample_ids": ["vGj1XLJvNrw", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["wails, wails, pass", "two objects, woman, speak"], "captions_pred_video": ["footage of a police car driving down a city street", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is silent", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "small dogs yip and bark sharply"], "sample_ids": ["tDlysoZiA1I", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["animal, grunts, chirps", "bark, yip, sharply"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a car accelerates and wind blows"], "sample_ids": ["tPJvjq9QePY", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["animal, bleat, moo", "accelerates, wind, blows"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, 
{"captions": ["a woman speaks with water running", "several insects fly while two men talk"], "sample_ids": ["wTideSjRFS0", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["water, running, woman", "several, fly, men"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "birds chirp and objects are moved around"], "sample_ids": ["sapQIQUhFc", "yPUYU6t3rwo"], "start_seconds": ["280", "370"], "properties": ["water, trickles, flow", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a machine runs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vD6lYD1l0BY", "su6FAOcOA8c"], "start_seconds": ["330", "4"], "properties": ["a, machine, run", "engine, idle, woman"], "captions_pred_video": ["game controller being held in the hands of the person", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a clock ticktocks"], "sample_ids": ["uKCSGgof8gI", "v-g-j2uTByM"], "start_seconds": ["12", "30"], "properties": ["chirps, distance, signal", "ticktocks, clock, ticktocks"], "captions_pred_video": 
["footage of a street in a small town on a sunny day", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vuUVPzd2FXw", "tdWhHV3X25Q"], "start_seconds": ["160", "60"], "properties": ["a, steam, release", "applause, audience, yells"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zofjfKhqLk8", "su6FAOcOA8c"], "start_seconds": ["10", "4"], "properties": ["background, metal, clings", "engine, idle, woman"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "water splashes as an animal walks through"], "sample_ids": ["voJh2gJxXhA", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["music, frog, croak", "animal, water, splashes"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["music is playing and crickets are chirping ", 
"water splashes and gurgles as people speak"], "question": "which entity is about a frog?", "label": 0}, {"captions": ["a drill runs and two people laugh", "a man speaks as a motor runs in the background"], "sample_ids": ["tEE3MpBt1sg", "xZepNM9qcRA"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "background, motor, run"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["people clap and speak in the distance", "a speedboat passes quickly on the water"], "sample_ids": ["wwyfGO2J4", "tjmoSi330GM"], "start_seconds": ["90", "23"], "properties": ["clap, distance, speak", "speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["siJFXfGWgDk", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["a, bird, vehicle", "a woman, something, fried"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["multiple birds vocalize and wind blows", "vehicles pass by on a roadway"], "sample_ids": ["uoGVs9yUqY4", "tgbONvsP47Y"], 
"start_seconds": ["30", "0"], "properties": ["multiple, vocalize, wind", "pass, vehicle, roadway"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tQWGZLItBXk", "tDVADusiIoc"], "start_seconds": ["170", "60"], "properties": ["music, person, ding", "water, radio, man"], "captions_pred_video": ["worms revolution screenshots", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["vBslzh7saPw", "uWPRNLnpy7Y"], "start_seconds": ["90", "10"], "properties": ["power, scream, increase", "accelerate, laugh, vehicle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "is taken from a car driving down the street"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xjvTpk2Zpr8", "uZesmtKZGSw"], "start_seconds": ["70", "250"], "properties": ["wind, blows, vehicle", "men, talk, cars"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a man speaks as a machine runs"], "sample_ids": ["wfHeoPDLMaM", "vD6lYD1l0BY"], "start_seconds": ["30", "330"], "properties": ["quacking, squawking, ducks", "a, machine, run"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "game controller being held in the hands of the person"], "captions_pred_audio": ["ducks are quacking", "a man is speaking and dishes are being washed "], "question": "which entity is not a 
person?", "label": 0}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a vehicle engine accelerating then running on idle"], "sample_ids": ["rwtmaKiCcQU", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["nozzle, depressed, spray can", "engine, accelerate, idle"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["spraying and people speaking", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["birds fly and flutter around", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wGKgwOP3h30", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["fly, flutter, around", "engine, accelerate, idle"], "captions_pred_video": ["of the pigeons in the coop", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["pigeons coo and flap their wings", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["electronic beeps occur in a short series", "water running down a sink while a man is talking"], "sample_ids": ["y682ml90jGw", "vSeGhaZt-aI"], "start_seconds": ["11", "50"], "properties": ["beeps, series, electronic", "water, sink, talk"], "captions_pred_video": [null, "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a man speaks followed by another man speaking outside"], "sample_ids": ["wwyfGO2J4", "viuTg1M-dqg"], "start_seconds": ["90", "30"], "properties": ["people, applaud, hoot", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["people are clapping and speaking 
with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a baby cries and a woman moans", "an insect buzzes around continuously"], "sample_ids": ["smDKStoHBJo", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["a, cry, woman", "buzzes, continuously, insect"], "captions_pred_video": ["a man holding a crying baby in his arms", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["skd2PphS6oI", "tDlysoZiA1I"], "start_seconds": ["190", "0"], "properties": ["ring, bird, vocalize", "animal, grunts, chirps"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "birds are chirping and a rooster is crowing "], "question": "which entity has a bird vocalize?", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "someone is typing on a computer keyboard"], "sample_ids": ["ylpYOorfH4o", "v0x1odnXtP0"], "start_seconds": ["410", "210"], "properties": ["engine, run, loud", "keyboard, type, computer"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 
dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vfYTJq7nU", "yajyRTUQk3U"], "start_seconds": ["130", "400"], "properties": ["ducks, quack, man", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a toilet door squeaks as it is opened"], "sample_ids": ["sShpyu2l4YQ", "sdXV-ylviw"], "start_seconds": ["0", "190"], "properties": ["growl, bark, yip", "door, toilet, squeaks"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sU53zg9Jp7s", "uZesmtKZGSw"], "start_seconds": ["380", "250"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "men, talk, cars"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["several ducks quack and cocks crow far away", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sNB8zxXneIM", "tDVADusiIoc"], "start_seconds": ["20", "60"], "properties": ["several, quack, cocks", "water, radio, man"], "captions_pred_video": ["a group of geese in a cage", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "tapping occurs then a baby cries"], "sample_ids": ["x5cuQjOdM3E", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["cat, meows, young woman", "a, cry, baby"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a cat meows and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["several insects fly while two men talk", "some men converse over an engine running"], "sample_ids": ["s-T9OVOiMLo", "sCiy7QS1U"], "start_seconds": ["330", "300"], "properties": ["several, fly, men", "men, converse, engine"], "captions_pred_video": ["a man climbing up a tree using a rope to reach 
the top of the tree", null], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has more people", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wqZ135Ssz0", "sLUnaPT5gM8"], "start_seconds": ["60", "0"], "properties": ["two men, woman, birds", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wyllXV6PjKo", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["a kid, talk, cry", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a baby is crying"], "question": "which entity has a kid?", "label": 0}, {"captions": ["a clock ticktocks briefly", "a person is burping then speaks and laughs"], "sample_ids": ["u7C-AEBQM", "wAAkbZToh8"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, ticktocks briefly", "burp, laugh, speak"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a man burps and a woman speaks"], "question": "which entity is speaking?", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "several insects fly while two men talk"], "sample_ids": ["yajyRTUQk3U", "s-T9OVOiMLo"], "start_seconds": ["400", "330"], "properties": ["a woman, something, fried", "several, fly, 
men"], "captions_pred_video": ["- a woman cooking in the kitchen", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["an engine starts and increases in power", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zjTG0gaGCUI", "zFjIWfSD-4"], "start_seconds": ["80", "410"], "properties": ["power, increase, engine", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a motor?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xvDdE3zNf8Y", "wDVMhEdTiVw"], "start_seconds": ["120", "30"], "properties": ["a, female, speaks", "gun, shoot, water"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman speaks and crumples paper", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more violent", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "vehicles pass by on a roadway"], "sample_ids": ["wqZ135Ssz0", "tgbONvsP47Y"], "start_seconds": ["60", "0"], "properties": ["two men, woman, birds", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["a helicopter engine runs continuously", "vehicles 
pass by on a roadway"], "sample_ids": ["ugHJF0hfYkg", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["continuous sneezing together with speech", "wind blowing followed by a zoom"], "sample_ids": ["x4dZyf9Gbj0", "vr8ZXjEBhMQ"], "start_seconds": ["130", "150"], "properties": ["continuous, sneeze, speech", "wind, blow, zoom"], "captions_pred_video": ["footage is blurry and out of focus", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman sneezes and speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a door opens and closes", "several insects fly while two men talk"], "sample_ids": ["vBHyYJ8pL0", "s-T9OVOiMLo"], "start_seconds": ["2", "330"], "properties": ["open, close, door", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a museum", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vMf1dLD6Sng", "sLUnaPT5gM8"], "start_seconds": ["6", "0"], "properties": ["frog, bird, vocalize", "loud, laughter, intermittent"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a frog croaks loudly", "a baby is laughing and breathing while a man is 
speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a female speaks softly as paper crinkles", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["xvDdE3zNf8Y", "tiDFTC-5vU"], "start_seconds": ["120", "30"], "properties": ["a, female, speaks", "male, duck, laugh"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and ducks are quacking"], "question": "which entity is more likely to be a person speaking?", "label": 0}, {"captions": ["frogs croak and vocalize", "wind blowing followed by a zoom"], "sample_ids": ["yswmmRZFItk", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["croak, vocalize, frog", "wind, blow, zoom"], "captions_pred_video": ["a close up of a frog in the water", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a frog is croaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "waves crash against a shoreline and people speak"], "sample_ids": ["w2JXXIAdUdg", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["snoring, distance, person", "wave, crash, shoreline"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "water is sprayed across a hard surface"], "sample_ids": ["sncRqQ67iJU", "sQwlkXjQabo"], "start_seconds": ["460", "10"], "properties": ["loud, repeatedly, man", "water, spray, surface"], "captions_pred_video": ["of an 
airplane flying in the dark sky at night", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a person is snoring", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vSeGhaZt-aI", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["water, sink, talk", "engine, laugh, loud"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "vehicles pass by on a roadway"], "sample_ids": ["ujMt0-D-x2k", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["snoring, rhythmical, nearby", "pass, vehicle, roadway"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person is snoring loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "paper is crumpling consistently"], "sample_ids": ["vBslzh7saPw", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["power, scream, increase", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a jet engine roars and accelerates ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a person is burping while a girl speaks"], "sample_ids": 
["wz7N8YRy74I", "vdoxuJn9lTc"], "start_seconds": ["30", "40"], "properties": ["rooster, crow, background, people", "person, burp, girl"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a child speaks followed by a burp"], "question": "which entity is a person speaking to a rooster?", "label": 0}, {"captions": ["a flush is followed by gurgling water, then another flush", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tqR406bGiE", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["flush, water, gurgle", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a video of a toilet flushing?", "label": 0}, {"captions": ["birds tweet and squawk", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["w1mlz3Pe4fU", "xfaoyyzw2WU"], "start_seconds": ["300", "180"], "properties": ["squawk, tweet, scream", "loud, jet engine, roar"], "captions_pred_video": ["of a bird in a cage", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["birds are chirping and singing", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a man speaks as a car is passing by"], "sample_ids": ["uYT5gxnyMWM", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "a, car, pass"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "a woman speaks as she rubs two objects together"], "sample_ids": ["vBslzh7saPw", "vzxHnu-SFEw"], "start_seconds": ["90", "80"], "properties": ["engine, roar, louder", "two objects, woman, speak"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is silent", "label": 1}, {"captions": ["an infant crying and a woman speaking with some 
distant murmuring", "a car speeding up in the distance"], "sample_ids": ["smDKStoHBJo", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["a, infant, speaking", "distance, car, speed"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["uWPRNLnpy7Y", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["accelerate, laugh, vehicle", "airplane, boy, fly"], "captions_pred_video": ["is taken from a car driving down the street", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["vcmWSmvti8", "zY3icUyMdh8"], "start_seconds": ["30", "20"], "properties": ["music, man, fire", "dog, bark, engine"], "captions_pred_video": [null, "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a car is driving and dogs are barking and squealing "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a person snores loudly multiple times at a close distance"], "sample_ids": ["w5W5Kqtc8E", "sSMl2vc3ek"], "start_seconds": ["100", "20"], "properties": ["wind, blow, vehicle", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are 
shouting and cheering ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["electronic beeps occur in a short series", "waves crash against a shoreline and people speak"], "sample_ids": ["y682ml90jGw", "yFB25fqfU8I"], "start_seconds": ["11", "300"], "properties": ["beeps, series, electronic", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["birds chirp and wind blows", "waves crash against a shoreline and people speak"], "sample_ids": ["sxIvBMSavMQ", "yFB25fqfU8I"], "start_seconds": ["210", "300"], "properties": ["birds, chirp, wind", "wave, crash, shoreline"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage of a person surfing in the ocean"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a sheep baa followed by birds chirping and then more 
sheep baaing"], "sample_ids": ["slZLHwNbbt4", "vlS6YMeWAPo"], "start_seconds": ["300", "40"], "properties": ["clap, distance, horn", "sheep, baa, birds"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["uEU-Hg5MTN8", "vlJS7LN2XyM"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "background, clocks, ticking"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a ticktock of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a man speaks as a car is passing by"], "sample_ids": ["v5P-ThUCINM", "sK4u5T8hW78"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking with background noise and breathing sounds "], "question": "which 
entity has a car passing by?", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "an infant crying as a woman laughs"], "sample_ids": ["wjsXBsc7M40", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "a, laugh, infant"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is a video of a baby laughing?", "label": 0}, {"captions": ["a toilet flushes and water sputters as it drains", "a woman speaks as she rubs two objects together"], "sample_ids": ["smGI3C1NZc", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["water, drain, toilet", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["a man talks nearby 
and another man talks far away while some liquid flows", "a man speaks as a motor runs in the background"], "sample_ids": ["sapQIQUhFc", "xZepNM9qcRA"], "start_seconds": ["280", "30"], "properties": ["liquid, flow, distance", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a child yells and another yells", "an engine runs loudly"], "sample_ids": ["vMDHu7Lxcgw", "vqZuVbG6-HI"], "start_seconds": ["410", "130"], "properties": ["two, yell, child", "loud, engine, run"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sTpirNYo8vQ", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a, tone, fast", "a, scream, girl"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks as a motor runs in the background"], "sample_ids": ["sfAvvZwdLCY", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "background, motor, run"], "captions_pred_video": ["footage of the toilet in the bathroom", "a close-up view of the motorcycle's engine and exhaust 
system"], "captions_pred_audio": ["a toilet is flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["tapping occurs then a baby cries", "people speak as gunfire rings out"], "sample_ids": ["wIJK3-5y0kA", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["a, cry, baby", "gunfire, ring, speak"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a man talks while vehicles pass by", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sK4u5T8hW78", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["a, man, talk", "People, motor, brakes"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more people", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a frog vocalizes while birds chirp"], "sample_ids": ["wTideSjRFS0", "vMf1dLD6Sng"], "start_seconds": ["30", "6"], "properties": ["food, sizzle, woman", "frog, bird, vocalize"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a frog in a pond with pink flowers in the 
background"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a frog croaks loudly"], "question": "which entity is a frog?", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a propeller rotates loudly and intensely"], "sample_ids": ["xzKKf9bKNUo", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["background, noise, snoring", "loud, intense, propeller"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person snoring loudly", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wnpJndXuxLc", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["blows, vehicle, train", "water, radio, man"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a train?", "label": 0}, {"captions": ["some clanking with distant murmuring", "a man speaks with another voice speaking in the background"], "sample_ids": ["uMTTDZ2mb4", "u21-Z5gJCB8"], "start_seconds": ["30", "30"], "properties": ["clanking, murmuring, distant", "background, voice, man"], "captions_pred_video": [null, "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking with another voice speaking in 
the background?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["x5cuQjOdM3E", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["cat, meows, young woman", "rooster, crow, background, men"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more domesticated", "label": 0}, {"captions": ["a motor runs and stops, and animals squawk and croak", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["s4tUs779vBA", "w34HjHr6gAY"], "start_seconds": ["160", "30"], "properties": ["a, sound, stop", "beeps, hit, woman"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a baby cries and a woman moans"], "sample_ids": ["spJCm8tD9Zo", "smDKStoHBJo"], "start_seconds": ["90", "0"], "properties": ["snores, wheezes, sleeps", "a, cry, woman"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man holding a crying baby in his arms"], 
"captions_pred_audio": ["a person is snoring loudly", "a baby is crying and a woman is speaking"], "question": "which entity is a person", "label": 0}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a child speaks in closed space"], "sample_ids": ["ziUT9IFTkjg", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["background, birds, rustling", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "people cheer as a vehicle engine revs"], "sample_ids": ["sfAvvZwdLCY", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["flushes, drains, water", "engine revs, vehicle, people"], "captions_pred_video": ["footage of the toilet in the bathroom", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a toilet is flushed", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a clang followed by a toilet flushing", "paper is crumpling consistently"], "sample_ids": ["wNZ5thZM7XU", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["sound, flush, toilet", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a toilet in a bathroom stall", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a toilet flushes", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "roadway noise occurs and a truck accelerates"], "sample_ids": 
["vqZuVbG6-HI", "tgbONvsP47Y"], "start_seconds": ["130", "0"], "properties": ["background, male, female", "noise, truck, accelerate"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a car is driving on the road "], "question": "which entity is more quiet", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a man speaks as a motor runs in the background"], "sample_ids": ["sWZzXuWYY", "xZepNM9qcRA"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a telephone rings followed by a woman talking"], "sample_ids": ["wyllXV6PjKo", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["a kid, talk, cry", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a motorcycle engine works nearby", "an insect buzzes around continuously"], "sample_ids": ["tOSWIURC-4", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["engine, work, nearby", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a lawn mower is running ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["an airplane engine runs", "a male is speaking and a duck quacks as 
others laugh"], "sample_ids": ["yVPZ2MNWpms", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "male, duck, laugh"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a clock ticktocks"], "sample_ids": ["y2bVZ7rz-5M", "v-g-j2uTByM"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "paper is crumpling consistently"], "sample_ids": ["sd7xVssqlw", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["accelerates, tires, squealing", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a child yells and another yells", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vMDHu7Lxcgw", "tdWhHV3X25Q"], "start_seconds": ["410", "60"], "properties": ["two, yell, child", "applause, audience, yells"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking and a crowd is 
clapping"], "question": "which entity has more people", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tgbONvsP47Y", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["pass, vehicle, roadway", "engine, idle, woman"], "captions_pred_video": ["footage of a fire truck entering a garage", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "water running down a sink while a man is talking"], "sample_ids": ["sZPuqDgX2V0", "vSeGhaZt-aI"], "start_seconds": ["30", "50"], "properties": ["commentator, race, track", "water, sink, talk"], "captions_pred_video": [null, "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of a race?", "label": 0}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a person is snoring while sleeping"], "sample_ids": ["uzQnlJXBbOM", "vJrjSeP17yE"], "start_seconds": ["50", "40"], "properties": ["ringing, beep, stop", "a person is sleeping, snoring, person"], "captions_pred_video": ["footage of a person using a cell phone on a table", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a telephone rings and a man speaks", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "females talk and laugh over gusting wind"], "sample_ids": ["tEE3MpBt1sg", "un9VQlzgZM"], "start_seconds": ["50", "5"], "properties": ["drill, something, laugh", 
"females, talk, laugh"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is about a drill?", "label": 0}, {"captions": ["water flows followed by women screaming", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["w5W5Kqtc8E", "vbZ-0lGPneg"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more calm", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "people speak as gunfire rings out"], "sample_ids": ["tDVADusiIoc", "wqTCwqVRDlk"], "start_seconds": ["60", "80"], "properties": ["wind, radio, waves", "gunfire, ring, speak"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "wind blows as people chatter quietly"], "sample_ids": ["vXlk0lIQBFo", "xBxDz0CFVn0"], "start_seconds": ["470", "30"], "properties": ["wind, speak, vocalize", "wind, chatter, people"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage is blurry and out of focus"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking with wind 
noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a stream of water runs briefly"], "sample_ids": ["yLy-WycbVVE", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["background, people, talk", "stream, water, run"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["yeFvk9x0wWI", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["clack, bird, chirp", "gun, shoot, water"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "water splashes as an animal walks through"], "sample_ids": ["zliInBdC98Y", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["a, baby, cries, wails", "animal, water, splashes"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a baby cries and a woman speaks", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["an audience gives applause", "a man speaks as crickets sing"], "sample_ids": ["x6iCUDmRpKQ", "ryFDPxgDOGc"], 
"start_seconds": ["38", "570"], "properties": ["applause, audience, give", "a, crickets, sing"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a group of people dressed in camouflage and hunting gear in the dark"], "captions_pred_audio": ["a group of people are clapping and cheering", "a man is speaking with crickets chirping in the background"], "question": "which is not a type of music", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["uWAAAL4CIoc", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["a woman, chirps, animal", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "birds are chirping and a rooster is crowing "], "question": "which entity has a woman speaking happily and an animal chirps?", "label": 0}, {"captions": ["people speak as gunfire rings out", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wqTCwqVRDlk", "xBxDz0CFVn0"], "start_seconds": ["80", "30"], "properties": ["gunfire, ring, speak", "stream, water, flow"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a person speaks briefly", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["zOZleIRqZm4", "w34HjHr6gAY"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "beeps, hit, woman"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a beep sounds followed by a child speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "people cheer as a vehicle engine revs"], "sample_ids": ["sQwlkXjQabo", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["liquid, surface, spray", "engine revs, vehicle, people"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["spraying followed by silence", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["someone is burping continuously", "a man speaks as a car is passing by"], "sample_ids": ["y636gklDioE", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "a, car, pass"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person burps loudly several times", "a man is speaking with background noise and breathing sounds "], 
"question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["continuous sneezing together with speech", "paper is crumpling consistently"], "sample_ids": ["x4dZyf9Gbj0", "v5cSxLaHADY"], "start_seconds": ["130", "0"], "properties": ["continuous, sneeze, speech", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and out of focus", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman sneezes and speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling consistently", "label": 1}, {"captions": ["food is frying then a woman speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["ukxt9I7eMMg", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "multiple, people, yell"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a clock ticktocks"], "sample_ids": ["uZesmtKZGSw", "v-g-j2uTByM"], "start_seconds": ["250", "30"], "properties": ["men, talk, cars", "ticktocks, clock, ticktocks"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "in your own 
words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "an airplane engine roars increasingly louder"], "sample_ids": ["wSVhSdj0F0", "vBslzh7saPw"], "start_seconds": ["10", "90"], "properties": ["beep, clang, footsteps", "engine, roar, louder"], "captions_pred_video": [null, "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a jet engine roars and accelerates "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "someone whistles a tune"], "sample_ids": ["w5W5Kqtc8E", "sIXTftIuUgw"], "start_seconds": ["100", "90"], "properties": ["wind, blow, vehicle", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a man speaks as a car is passing by"], "sample_ids": ["yRx9txMcBl0", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "a, car, pass"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a clock ticktocks continuously", "paper is crumpling consistently"], "sample_ids": ["vlJS7LN2XyM", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, ticktocks continuously", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a ticktock of a clock", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["tgbONvsP47Y", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["noise, truck, accelerate", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a fire truck entering a garage", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a car is driving on the road ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks while turning a water faucet on", "some men converse over an engine 
running"], "sample_ids": ["vf9xf3vMsGM", "sCiy7QS1U"], "start_seconds": ["540", "300"], "properties": ["A man speaks while turning a water faucet on.", "men, converse, engine"], "captions_pred_video": ["of the person washing their hands under the faucet", null], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "wind blows and people scream while an engine revs"], "sample_ids": ["s4Uz1Ffgo04", "w5W5Kqtc8E"], "start_seconds": ["100", "100"], "properties": ["water, rushes, motorcycle", "wind, engine, scream"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "waves crash against a shoreline and people speak"], "sample_ids": ["w5W5Kqtc8E", "yFB25fqfU8I"], "start_seconds": ["100", "300"], "properties": ["wind, engine, scream", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a telephone rings followed by a woman talking"], "sample_ids": ["xOZfdgAgJ9o", "tGcFnX0GHI"], "start_seconds": ["40", "0"], "properties": ["woman, whimpering, speaking", "ring, talk, woman"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", null], 
"captions_pred_audio": ["a woman is speaking and a baby is crying", "a dial tone sounds followed by a woman speaking"], "question": "which woman is speaking", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wTideSjRFS0", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["food, sizzle, woman", "applause, audience, yells"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman sneezes then speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["x4dZyf9Gbj0", "sSMl2vc3ek"], "start_seconds": ["130", "20"], "properties": ["sneezes, speaks, woman", "loud, multiple, distance"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["someone snores nearby", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["spJCm8tD9Zo", "vbZ-0lGPneg"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a woman, a television program, a bird"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "people speak in the background as a clock ticktocks"], "sample_ids": ["slZLHwNbbt4", "vZAw4apG0Es"], 
"start_seconds": ["300", "30"], "properties": ["train, horn, sound", "background, clock, ticktocks"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a clock is ticking and people are talking"], "question": "which entity is a clock", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "people speak as gunfire rings out"], "sample_ids": ["uqFtmnhuqA8", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["a, b, c", "gunfire, ring, speak"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sK4u5T8hW78", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["a, car, pass", "airplane, boy, fly"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman speaks while a helicopter flies overhead "], "question": "which 
object is flying by", "label": 0}, {"captions": ["paper is repeatedly crumpled and crinkled", "water flows and trickles"], "sample_ids": ["vms5XGTDVQc", "tB7hWb9gTuQ"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "water, flow, trickle"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["paper is crumpled and crinkled", "water is splashing and gurgling"], "question": "which entity is flowing", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "winds blows roughly as a vehicle races past"], "sample_ids": ["tK4VlLsNxak", "xjvTpk2Zpr8"], "start_seconds": ["120", "70"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "wind, blows, vehicle"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "birds tweet and squawk"], "sample_ids": ["sLUnaPT5gM8", "w1mlz3Pe4fU"], "start_seconds": ["0", "300"], "properties": ["loud, laughter, intermittent", "squawk, tweet, scream"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of a bird in a cage"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "birds are chirping and singing"], "question": "which entity is a bird", "label": 1}, {"captions": ["food is frying then a woman speaks", "several insects fly while two men talk"], "sample_ids": ["ukxt9I7eMMg", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["food, woman, speak", "several, fly, men"], "captions_pred_video": ["footage of 
a person preparing food on a stool in a kitchen", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["low humming with a clock ticking and birds chirping", "a infant makes noise and is excited"], "sample_ids": ["yVumC9TGknc", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["humming, clock, birds", "noise, excited, infant"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a series of beeps and chirps", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vYkA3cfXp5Q", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["speed, idle, accelerate", "three men, wind, flow"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["an engine is idling", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more like a movie", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["t25U-v4k4ts", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["bees buzz, birds chirp, man speaks", "engine, laugh, loud"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a jet engine roars "], "question": "which entity is louder", 
"label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a stream of water runs briefly"], "sample_ids": ["sxYkFKFIZD0", "x-PeY8Yb8M4"], "start_seconds": ["20", "300"], "properties": ["screech, man, door", "stream, water, run"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "people applaud and hoot and chat quietly"], "sample_ids": ["xfudFO976zE", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["animal, bleats, cry", "people, applaud, hoot"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "two men speak as a buffeting wind blows"], "sample_ids": ["vf9xf3vMsGM", "y8WEcpOlT3I"], "start_seconds": ["540", "40"], "properties": ["A man speaks while turning a water faucet on.", "wind, speak, buffeting"], "captions_pred_video": ["of the person washing their hands under the faucet", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man is speaking with wind noise in the background "], "question": "which entity is indoors", "label": 0}, 
{"captions": ["a male speaks and another male speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["viuTg1M-dqg", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "background, motor, run"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yJ0TePmaOo", "zl9Dqx-j7q4"], "start_seconds": ["390", "6"], "properties": ["two hard objects, man, speak", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wyllXV6PjKo", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["a baby, a woman, a man", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a baby?", "label": 0}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vuUVPzd2FXw", "yDoT73BWsdA"], "start_seconds": ["160", "10"], "properties": ["a, steam, release", "engine, revs, vehicle"], "captions_pred_video": ["of the person 
cooking on the grill with a spatula", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an infant crying as a woman laughs", "wind blows as people chatter quietly"], "sample_ids": ["xhmRY9yhC7c", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["a, laugh, infant", "wind, chatter, people"], "captions_pred_video": ["of a baby crying in a baby bouncer", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a cat meows and children speak", "an engine revs and a turning noise is made"], "sample_ids": ["x5cuQjOdM3E", "tOSWIURC-4"], "start_seconds": ["30", "0"], "properties": ["cat, speak, children", "noise, engine, revs"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a lawn mower is running "], "question": "which entity is making a noise", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a child speaks in closed space"], "sample_ids": ["sfAvvZwdLCY", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["flushes, drains, water", "child, space, speak"], "captions_pred_video": ["footage of the toilet in the bathroom", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a man speaks as a motor runs in the background"], "sample_ids": ["sDSppXIlJrs", "xZepNM9qcRA"], "start_seconds": ["27", "30"], 
"properties": ["microphone, water, wind", "background, motor, run"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a woman speaks as she rubs two objects together"], "sample_ids": ["slZLHwNbbt4", "vzxHnu-SFEw"], "start_seconds": ["300", "80"], "properties": ["a, horn, run", "two objects, woman, speak"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a baby cries and a woman moans", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["smDKStoHBJo", 
"uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["a, cry, woman", "a, scream, girl"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "people cheer as a vehicle engine revs"], "sample_ids": ["yRx9txMcBl0", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["motors, tires, screech", "engine revs, vehicle, people"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a person snoring several times", "a clock ticktocks"], "sample_ids": ["spJCm8tD9Zo", "v-g-j2uTByM"], "start_seconds": ["90", "30"], "properties": ["snore, person, several", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person is snoring loudly", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": 
["distant men speak as a spray can nozzle is depressed", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["rwtmaKiCcQU", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["nozzle, depressed, spray can", "water, radio, man"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["spraying and people speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water?", "label": 1}, {"captions": ["water splashes as an animal walks through", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["w1ir-sZ3Im8", "tDlysoZiA1I"], "start_seconds": ["90", "0"], "properties": ["animal, water, splashes", "animal, grunts, chirps"], "captions_pred_video": ["footage of a group of people riding horses through a river", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["water splashes and gurgles as people speak", "birds are chirping and a rooster is crowing "], "question": "which animal is making noise", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a car accelerates and wind blows"], "sample_ids": ["vddP56-ogds", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["water, flow, laugh", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a machine beeps continuously"], "sample_ids": ["sofxkNWaP0s", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["wind, engine, louder", "beeps, machine, continuously"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the 
foreground", null], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a power tool runs and touches a surface"], "sample_ids": ["vh30P49Po6s", "zfvPRf3chY"], "start_seconds": ["30", "290"], "properties": ["loud, continuous, quacks", "power tool, run, touch"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking while a power tool is being used "], "question": "which is not a continuous action", "label": 1}, {"captions": ["a baby cries and a woman moans", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["smDKStoHBJo", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["a, cry, woman", "engine, idle, woman"], "captions_pred_video": ["a man holding a crying baby in his arms", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "water pouring and bubbling"], "sample_ids": ["vb1fPSDI4c", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["multiple, people, yell", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a crowd of people are talking and laughing", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "people cheer as a vehicle engine revs"], "sample_ids": ["wTideSjRFS0", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["food, sizzle, woman", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wTideSjRFS0", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["food, sizzle, woman", "three men, wind, flow"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking as frying food sizzles?", "label": 0}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zgUgkpk78xU", "wz7N8YRy74I"], "start_seconds": ["70", "30"], "properties": ["clinking, humming, 
horn", "rooster, crow, background, men"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "water pouring and bubbling"], "sample_ids": ["ylpYOorfH4o", "uyRfq-jKPpo"], "start_seconds": ["410", "50"], "properties": ["engine, running, wind", "water, bubbles, pouring"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and an engine is revving", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wtDqrBygTcU", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["man, engine, run", "harsh, wind, blows"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a motor is running", "a man is speaking with wind noise in the background "], "question": "which entity is about a boat?", "label": 0}, {"captions": ["a kid speaks followed by music playing", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tQWGZLItBXk", "vfYTJq7nU"], "start_seconds": ["170", "130"], "properties": ["music, kid, speak", "rustling, ducks, quack"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "water pouring and bubbling"], "sample_ids": ["u21-Z5gJCB8", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["background, voice, man", "water, bubbles, pouring"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "on youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "water is running from a faucet"], "question": "which entity is more active", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a car accelerates and wind blows"], "sample_ids": ["wqADXCzngMw", "u0TrcHhkPQ"], "start_seconds": ["340", "20"], "properties": ["engine, idle, man", "accelerates, wind, blows"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", null], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["birds fly and flutter around", "paper is crumpling consistently"], "sample_ids": ["wGKgwOP3h30", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["fly, flutter, around", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the pigeons in the coop", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["pigeons coo and flap their wings", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks uses a drill", "some men talk among st themselves as cars speed and race loudly"], 
"sample_ids": ["x5eIC7S0fbg", "uZesmtKZGSw"], "start_seconds": ["60", "250"], "properties": ["A man is speaking, uses a drill, and is a tool", "men, talk, cars"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a tool", "label": 0}, {"captions": ["a woman speaks in a fast tone with a male", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sTpirNYo8vQ", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["a, tone, fast", "rustling, ducks, quack"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a duck quacks and a woman speaks"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a infant makes noise and is excited"], "sample_ids": ["rqfQRErjfk8", "wIJK3-5y0kA"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "noise, excited, infant"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a crowd of 
people clapping and cheering", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an audience gives applause", "multiple people speak and children yell while water gurgles"], "sample_ids": ["x6iCUDmRpKQ", "vb1fPSDI4c"], "start_seconds": ["38", "30"], "properties": ["applause, audience, give", "multiple, people, yell"], "captions_pred_video": ["a black background with the moon and stars in the sky", null], "captions_pred_audio": ["a group of people are clapping and cheering", "a crowd of people are talking and laughing"], "question": "which entity is more active", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["zY3icUyMdh8", "vVhthZ45k3Y"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "cat, purr, hiss"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking and a cat is meowing"], "question": "which entity is more playful", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uPDn2BFTHk", "zl9Dqx-j7q4"], "start_seconds": ["140", "6"], "properties": ["woman, laughs, speaks", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["loud, continuous burping", "multiple people speak and children yell while water gurgles"], "sample_ids": ["y636gklDioE", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["loud, continuous, burping", "multiple, people, yell"], "captions_pred_video": ["a dog sitting on a 
red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a car accelerates and wind blows", "a man speaks as a car is passing by"], "sample_ids": ["u0TrcHhkPQ", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which car is moving faster", "label": 0}, {"captions": ["a woman sneezes then speaks", "birds chirp and objects are moved around"], "sample_ids": ["x4dZyf9Gbj0", "yPUYU6t3rwo"], "start_seconds": ["130", "370"], "properties": ["sneezes, speaks, woman", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage is blurry and out of focus", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman sneezes and speaks", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a toilet flushes and water drains"], "sample_ids": ["tEE3MpBt1sg", "sfAvvZwdLCY"], "start_seconds": ["50", "20"], "properties": ["drill, something, laugh", "water drains, flushes, water"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of the toilet in the bathroom"], 
"captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "an airplane engine runs"], "sample_ids": ["vbr9mHKc8WM", "yVPZ2MNWpms"], "start_seconds": ["40", "0"], "properties": ["noise, loudness, engine", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["an engine is idling", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a dog barks and whimpers"], "sample_ids": ["zj2R0XoFr5k", "sShpyu2l4YQ"], "start_seconds": ["50", "0"], "properties": ["airplane, boy, fly", "barks, whimpers, dog"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "the puppies are playing with a toy"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a dog is barking and growling"], "question": "which entity is a dog?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y2bVZ7rz-5M", "w5W5Kqtc8E"], "start_seconds": ["280", "100"], "properties": ["motor noise, horn, siren", "wind, blow, vehicle"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["food is frying then a woman speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["ukxt9I7eMMg", 
"vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["an airplane accelerates briefly", "dishes cling together then a man begins to speak"], "sample_ids": ["zjTG0gaGCUI", "sQGXqGcwOTc"], "start_seconds": ["80", "3"], "properties": ["accelerates, airplane, briefly", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a jet engine roars as wind blows ", "mechanisms are operating and water is splashing "], "question": "which entity is a moving object", "label": 0}, {"captions": ["a toilet flushes and water drains", "a man speaks while turning a water faucet on"], "sample_ids": ["sfAvvZwdLCY", "vf9xf3vMsGM"], "start_seconds": ["20", "540"], "properties": ["water drains, flushes, water", "A man speaks while turning a water faucet on."], "captions_pred_video": ["footage of the toilet in the bathroom", "of the person washing their hands under the faucet"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while water is running in the background"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "water pouring and bubbling"], "sample_ids": ["y1saVTXsKwc", "uyRfq-jKPpo"], "start_seconds": ["80", "50"], "properties": ["a, dog, talk", "water, bubbles, pouring"], "captions_pred_video": ["a dog playing with a pink ball", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an 
afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a dog barks and a man speaks", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "vehicles pass by on a roadway"], "sample_ids": ["tdWhHV3X25Q", "tgbONvsP47Y"], "start_seconds": ["60", "0"], "properties": ["applause, audience, yells", "pass, vehicle, roadway"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a person sniffles and sneezes", "sucking and grunting followed by slurping with birds in the background"], "sample_ids": ["uRlbY6aoBU", "yYEVLuqEytU"], "start_seconds": ["0", "40"], "properties": ["sneezes, sniffles, person", "grunt, slurp, background"], "captions_pred_video": [null, "a baby goat is being petted by a person's hand"], "captions_pred_audio": ["a man is sneezing ", "several sheep bleat and a man speaks"], "question": "which entity is a person", "label": 0}, {"captions": ["a beep repeats multiple times", "a car speeding up in the distance"], "sample_ids": ["y682ml90jGw", "u0TrcHhkPQ"], "start_seconds": ["11", "20"], "properties": ["beep, repeat, multiple", "distance, car, speed"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["a beeping sound is being made ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vBHyYJ8pL0", "w5W5Kqtc8E"], "start_seconds": ["2", "100"], "properties": ["noise, door, opening", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a woman and man speak while food is frying"], "sample_ids": ["w8uLijTqtlU", "zk-xJGQU8-4"], "start_seconds": ["70", "130"], "properties": ["wind, microphone, noise", "food, man, woman"], "captions_pred_video": ["footage is blurry and shaky", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity is a video of a man and woman speaking?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["y2bVZ7rz-5M", "vbZ-0lGPneg"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a horn?", "label": 0}, {"captions": ["a man speaks over a radio as wind 
blows and water splashes", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tDVADusiIoc", "zj2R0XoFr5k"], "start_seconds": ["60", "50"], "properties": ["water, radio, man", "airplane, boy, fly"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about flying?", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "water flows as a woman laughs and a man speaks"], "sample_ids": ["xvDdE3zNf8Y", "vddP56-ogds"], "start_seconds": ["120", "30"], "properties": ["A, crumple, paper", "water, flow, laugh"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "water is running and gurgling and a man is speaking"], "question": "which entity is a video of a woman speaking and crumpling paper?", "label": 0}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "a car speeding up in the distance"], "sample_ids": ["yNtRmrn0io8", "u0TrcHhkPQ"], "start_seconds": ["210", "20"], "properties": ["storm, distance, strike", "distance, car, speed"], "captions_pred_video": ["footage of a house in the middle of the night", null], "captions_pred_audio": ["rain falls and thunder roars", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["children speak and play together", "a clock ticktocks"], "sample_ids": ["yVVP8XvWJTo", "v-g-j2uTByM"], "start_seconds": ["260", "30"], "properties": ["children, speak, play", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a playground at a school or daycare center", "in your own words a 
cuckoo clock hanging on the wall"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "birds chirp quietly and an adult man speaks"], "sample_ids": ["yRx9txMcBl0", "zuua6-5goWw"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "birds, chirp, quiet, man, speaks"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["a car is revving its engine and skidding ", "birds are chirping and a man is speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "engines sputter roughly and tires squeal"], "sample_ids": ["se87d6yxEOA", 
"zhx6hoYrHeI"], "start_seconds": ["10", "160"], "properties": ["run, whistle, pass", "engine, sputter, rough"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of a man working on a motorcycle's tire"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a car accelerates and revs its engine "], "question": "which engine is rougher", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a man speaks while a machine runs before a smoke alarm beeps"], "sample_ids": ["wudZTNBtVqc", "sG7TyPnFDR0"], "start_seconds": ["60", "180"], "properties": ["accelerates, engine, wind", "beeps, machine, smoke alarm"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "a person is using an espresso machine in a restaurant"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a microwave oven is beeping "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a helicopter engine runs", "people cheer as a vehicle engine revs"], "sample_ids": ["t5ZbXbniOWk", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["engine, helicopter, run", "engine revs, vehicle, people"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a helicopter is flying overhead ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["birds coo incessantly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yZrFNS7GFBQ", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["coo, bird, incessant", "rooster, crow, background, men"], "captions_pred_video": ["of the bird in the cage", "footage of the sun shining through the clouds 
on a cloudy day"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vveS8HT7Uog", "tDVADusiIoc"], "start_seconds": ["100", "60"], "properties": ["a man, objects, speak", "water, radio, man"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which man is speaking over a radio?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wTideSjRFS0", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["food, sizzle, woman", "wind, blow, vehicle"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a stream of water runs briefly"], "sample_ids": ["wTjoRj1se3U", "x-PeY8Yb8M4"], "start_seconds": ["390", "300"], "properties": ["airplane, engine, spool", "stream, water, run"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car is driving on a wet road "], "question": "which entity is a stream of 
water?", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a toilet flushes and a female speaks"], "sample_ids": ["u5RmF3c3Aw", "yaln9y8I7ms"], "start_seconds": ["60", "230"], "properties": ["engine, car, zoom", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yVumC9TGknc", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["humming, clock, birds", "a woman, something, fried"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a series of beeps and chirps", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vddP56-ogds", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["liquid, laughs, man", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has a man talking?", "label": 0}, {"captions": ["females talk and laugh over gusting wind", "a person sniffles and then sneezes in the distance"], "sample_ids": ["un9VQlzgZM", "uRlbY6aoBU"], "start_seconds": ["5", "0"], "properties": ["females, talk, laugh", "a, distance, sneeze"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and 
laughing with wind noise and breathing in the background ", "a man is sneezing "], "question": "which entity is about a person?", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "a propeller rotates loudly and intensely"], "sample_ids": ["tqR406bGiE", "ugHJF0hfYkg"], "start_seconds": ["40", "10"], "properties": ["flush, water, gurgle", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a toilet is flushed", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks over intermittent keyboard taps", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["tw76HGONaKg", "sapQIQUhFc"], "start_seconds": ["570", "280"], "properties": ["audio, man, keyboard", "liquid, flow, distance"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["a crowd yells, reacts and 
applauds", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wztCSUxOf8", "wz7N8YRy74I"], "start_seconds": ["130", "30"], "properties": ["a crowd, yells, applauds", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a woman speaks as she rubs two objects together"], "sample_ids": ["sncRqQ67iJU", "vzxHnu-SFEw"], "start_seconds": ["460", "80"], "properties": ["loud, repeatedly, man", "two objects, woman, speak"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren 
wailing", "someone snores nearby"], "sample_ids": ["wy1eKjR7KC0", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["people, talk, distance", "someone snores, nearby, someone"], "captions_pred_video": ["two police officers riding motorcycles down the street", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a person is snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["someone is burping continuously", "a child speaks in closed space"], "sample_ids": ["y636gklDioE", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["burps, burps, burps", "child, space, speak"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person burps loudly several times", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not speaking?", "label": 0}, {"captions": ["sirens ring and approach with humming of distant traffic", "water is sprayed across a hard surface"], "sample_ids": ["xERFUeZONz8", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["ring, approach, traffic", "water, spray, surface"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["an emergency vehicle siren blares", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["some clanking with distant murmuring", "people applaud and hoot and chat quietly"], "sample_ids": ["uMTTDZ2mb4", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["clanking, murmuring, distant", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", 
"people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "birds chirp and objects are moved around"], "sample_ids": ["wqADXCzngMw", "yPUYU6t3rwo"], "start_seconds": ["340", "370"], "properties": ["engine, idle, man", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wjsXBsc7M40", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "People, motor, brakes"], "captions_pred_video": ["footage of the baby playing with a toothbrush", null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tDlfY3nmx1A", "vfYTJq7nU"], "start_seconds": ["160", "130"], "properties": ["applause, laugh, man", "rustling, ducks, quack"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", null], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["after a few seconds of silence, a 
loud bang occurs followed by a softer banging noise", "a child speaks"], "sample_ids": ["zkKdxzNC97Y", "yW6FWLSLkx4"], "start_seconds": ["27", "40"], "properties": ["loud, bang, noise", "a, child, speaks"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a door is opened and closed", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is silent", "label": 0}, {"captions": ["a propeller moves loudly nearby", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["ugHJF0hfYkg", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["loud, propeller, move", "a, scream, girl"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is quieter", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zY3icUyMdh8", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["dog, bark, engine", "a woman, laughs, animal"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["paper folding and crinkling", "water flows as men speak and yell"], "sample_ids": ["zPpG3RD8lSs", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["paper, fold, crinkle", "water, flow, men"], "captions_pred_video": ["how to make a valentine's day card out 
of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "some tunes played by whistling"], "sample_ids": ["x6ijhqRY38s", "u6BnG6YZqJ4"], "start_seconds": ["250", "0"], "properties": ["something metal, glass, hit", "tune, play, whistling"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an engine runs and a man speaks", "paper is crumpling consistently"], "sample_ids": ["yT5WfYMRr-U", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["engine, run, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of the person 
holding a pair of scissors"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "people cheer as a vehicle engine revs"], "sample_ids": ["vJvryTwuAV8", "xjhAnI2q6hM"], "start_seconds": ["16", "6"], "properties": ["audience, cheer, man", "engine revs, vehicle, people"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "some tunes played by whistling"], "sample_ids": ["sNB8zxXneIM", "u6BnG6YZqJ4"], "start_seconds": ["20", "0"], "properties": ["several, quack, cocks", "tune, play, whistling"], "captions_pred_video": ["a group of geese in a cage", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "pigeons vocalize and birds chirp"], "sample_ids": ["vYkA3cfXp5Q", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["speed, idle, accelerate", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of the pigeon in the cage"], "captions_pred_audio": ["an engine is idling", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a girl talking, laughing and sneezing noise", "a person speaks over rustling leaves"], "sample_ids": 
["y4tPJXBKDig", "zOZleIRqZm4"], "start_seconds": ["20", "80"], "properties": ["a, noise, talk", "rustling, leaves, person"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a man is speaking with crickets chirping in the background"], "question": "which entity is a person speaking over rustling leaves?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vZAw4apG0Es", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["background, clock, ticktocks", "airplane, boy, fly"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a church bell rings several times", "a car accelerates and wind blows"], "sample_ids": ["sUVVjE3Ucp8", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["ring, bell, several", "accelerates, wind, blows"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", null], "captions_pred_audio": ["a church bell is ringing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks as a car is passing by"], "sample_ids": ["sfAvvZwdLCY", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "a, car, pass"], "captions_pred_video": ["footage of the toilet in the bathroom", "for 2007 hyundai accent 1 
6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "water flows and trickles"], "sample_ids": ["uoGVs9yUqY4", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["multiple, vocalize, wind", "water, flow, trickle"], "captions_pred_video": ["for how to make a wooden shed door youtube", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a male speaks over some small clicks", "a person is snoring while sleeping"], "sample_ids": ["uXxVebHsGZ8", "vJrjSeP17yE"], "start_seconds": ["30", "40"], "properties": ["male, clicks, speak", "a person is sleeping, snoring, person"], "captions_pred_video": [null, "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["male speech with light ticking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["xO-Q2BlIIPU", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["male, speech, ticking", "rooster, crow, background, men"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 
12 31 2016", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "an electric engine works nearby followed by a child talking"], "sample_ids": ["s59PfAghdkM", "xSKJGCItUWE"], "start_seconds": ["0", "10"], "properties": ["bird, chirp, background, horse, neigh", "engine, work, child"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage of the helicopter flying in the room"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a high pitched engine is running and a child speaks"], "question": "which entity has a child talking?", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a vehicle is skidding and squealing tires"], "sample_ids": ["ylpYOorfH4o", "soTOh3zYJfY"], "start_seconds": ["410", "40"], "properties": ["motor, run, steady", "vehicle, skid, tires"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a red car drifting on a winding road with smoke coming out of it"], 
"captions_pred_audio": ["a man is speaking and an engine is revving", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["insects humming with a dog barking and small goat bleating", "a woman speaks happily and an animal chirps"], "sample_ids": ["tIY7qOV3rEM", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "a woman, chirps, animal"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a dog is barking "], "question": "which animal is speaking", "label": 0}, {"captions": ["a man speaks as a motor runs in the background", "wind blows as people chatter quietly"], "sample_ids": ["xZepNM9qcRA", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["background, motor, run", "wind, chatter, people"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["vcmWSmvti8", "siJFXfGWgDk"], "start_seconds": ["30", "50"], "properties": ["music, man, fire", "man, woman, vehicle"], "captions_pred_video": [null, "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a man speaking as music plays before artillery is fired?", "label": 0}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a train 
whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["su6FAOcOA8c", "ukg5L09Wpvo"], "start_seconds": ["4", "150"], "properties": ["engine, idle, woman", "clickety-clack, train, whistle"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a dog barks and whimpers", "people speak as gunfire rings out"], "sample_ids": ["sShpyu2l4YQ", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["barks, whimpers, dog", "gunfire, ring, speak"], "captions_pred_video": ["the puppies are playing with a toy", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["someone whistles a song", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sIXTftIuUgw", "wDVMhEdTiVw"], "start_seconds": ["90", "30"], "properties": ["someone, song, whistle", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person whistling a song", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["wind blows and people talk while livestock vocalizes", "an engine runs loudly"], "sample_ids": ["vXlk0lIQBFo", "vqZuVbG6-HI"], "start_seconds": ["470", "130"], "properties": ["wind, talk, vocalize", "loud, engine, run"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage is blurry because it's raining outside"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and 
laughing ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks uses a drill", "a woman speaks as she rubs two objects together"], "sample_ids": ["x5eIC7S0fbg", "vzxHnu-SFEw"], "start_seconds": ["60", "80"], "properties": ["A man is speaking, uses a drill, and is a tool", "two objects, woman, speak"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a tool", "label": 0}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["ylpYOorfH4o", "wqZ135Ssz0"], "start_seconds": ["410", "60"], "properties": ["motor, run, steady", "two men, woman, birds"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge 
ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a man speaks as a car is passing by"], "sample_ids": ["ugHJF0hfYkg", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["loud, propeller, move", "a, car, pass"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a moving object", "label": 1}, {"captions": ["small dogs yip and bark sharply", "some men converse over an engine running"], "sample_ids": ["v-wcQf4BDY0", "sCiy7QS1U"], "start_seconds": ["120", "300"], "properties": ["bark, yip, sharply", "men, converse, engine"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", null], "captions_pred_audio": ["a 
dog barks and growls", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more quiet", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "winds blows roughly as a vehicle races past"], "sample_ids": ["weDbePuc-Xc", "xjvTpk2Zpr8"], "start_seconds": ["40", "70"], "properties": ["music, slaps, human", "wind, blows, vehicle"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a small engine spits as it runs", "a airplane flies overhead as a woman speaks"], "sample_ids": ["sZvwOuuPGP0", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["spits, engine, runs", "airplane, fly, woman"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a medium engine is running ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a man speaks over intermittent keyboard taps"], "sample_ids": ["sDSppXIlJrs", "tw76HGONaKg"], "start_seconds": ["27", "570"], "properties": ["microphone, water, wind", "audio, man, keyboard"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda 
ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a man speaks and types on a computer keyboard "], "question": "which entity is a recording of a man speaking?", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "wind blowing followed by a zoom"], "sample_ids": ["tDlfY3nmx1A", "vr8ZXjEBhMQ"], "start_seconds": ["160", "150"], "properties": ["applause, laugh, man", "wind, blow, zoom"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a man speaks as a car is passing by"], "sample_ids": ["vhJWZheqaE", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["water drains unevenly, toilet flushes, water drains", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uEU-Hg5MTN8", "xBxDz0CFVn0"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "stream, water, flow"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a child speaks in closed space"], "sample_ids": ["sQwlkXjQabo", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["liquid, surface, spray", "child, space, speak"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["spraying followed by silence", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a machine beeps continuously", "a machine beeps continuously"], "sample_ids": ["y682ml90jGw", "y682ml90jGw"], "start_seconds": ["11", "11"], "properties": ["beeps, machine, continuously", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a beeping sound is being made "], "question": "which machine beeps continuously", "label": 1}, {"captions": ["birds chirp then an animal grunts", "paper is crumpling consistently"], "sample_ids": ["tDlysoZiA1I", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["animal, grunt, chirp", "paper is crumpling, 
paper is white, paper is crumpling"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tOj4tdLRaA", "xKB8O8LTs6s"], "start_seconds": ["70", "70"], "properties": ["woman, laugh, baby", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby laughs and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["plastic is tapped on while someone speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wvKpEYswXO0", "vfYTJq7nU"], "start_seconds": ["150", "130"], "properties": ["plastic, tap, speak", "rustling, ducks, quack"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a duck quacks and a woman speaks"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["a person screams glaringly", "vehicles pass by on a roadway"], "sample_ids": ["xC8kbrKJmco", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["glaringly, screams, person", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a goat is bleating ", "a car is driving on the road "], "question": "which entity is more passive", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a vehicle engine revs as the 
vehicle passes"], "sample_ids": ["tOSWIURC-4", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["engine, work, nearby", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a lawn mower is running ", "a race car accelerates and revs its engine "], "question": "which entity has a vehicle passing by?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a man speaks while a rooster crows and other people speak in the background"], "sample_ids": ["s3cTDAj31g", "wz7N8YRy74I"], "start_seconds": ["80", "30"], "properties": ["man, talk, woman", "rooster, crow, background, people"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["water flows and trickles", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tB7hWb9gTuQ", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["water, flow, trickle", "a woman, something, fried"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "- a woman cooking in the kitchen"], "captions_pred_audio": ["water is splashing and gurgling", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "dishes cling together then a man begins to speak"], "sample_ids": ["u2f5NpsoHBg", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["person, laugh, clap", "cling, speak, dishes"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "a man washing dishes in a commercial 
kitchen"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "mechanisms are operating and water is splashing "], "question": "which entity shows a person speaking and laughing and clapping?", "label": 0}, {"captions": ["a woman talking as an infant is crying", "wind blows as people chatter quietly"], "sample_ids": ["tMbMDvT50j8", "xBxDz0CFVn0"], "start_seconds": ["12", "30"], "properties": ["a, talk, infant", "wind, chatter, people"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zuua6-5goWw", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["birds, chirp, quiet, man, speaks", "a woman, laughs, animal"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a person sniffs and sneezes", "a small airplane approaches and then 
flies by, after and during which a boy speaks"], "sample_ids": ["uRlbY6aoBU", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["sneezes, person, sniffs", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is sneezing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a child speaks in closed space"], "sample_ids": ["xjhAnI2q6hM", "yW6FWLSLkx4"], "start_seconds": ["6", "40"], "properties": ["wind, blow, loudly", "child, space, speak"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a car speeding up in the distance"], "sample_ids": ["xBxDz0CFVn0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["wind, chatter, people", "distance, car, speed"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "people speak as gunfire rings out"], "sample_ids": ["wwyfGO2J4", "wqTCwqVRDlk"], "start_seconds": ["90", "80"], "properties": ["people, applaud, hoot", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["people are clapping and speaking with background noise 
", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["several insects fly while two men talk", "waves crash against a shoreline and people speak"], "sample_ids": ["s-T9OVOiMLo", "yFB25fqfU8I"], "start_seconds": ["330", "300"], "properties": ["several, fly, men", "wave, crash, shoreline"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a toilet flushes and a female speaks"], "sample_ids": ["se87d6yxEOA", "yaln9y8I7ms"], "start_seconds": ["10", "230"], "properties": ["run, whistle, pass", "female, flushes, toilet"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a toilet flushes and a man speaks"], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ukxt9I7eMMg", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["continuous, woman, speaking", "wind, blow, vehicle"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a woman speaking towards the end?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "someone whistles a tune"], 
"sample_ids": ["ugHJF0hfYkg", "sIXTftIuUgw"], "start_seconds": ["10", "90"], "properties": ["engine, running, continuously", "someone, tune, whistle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a person whistling a song"], "question": "which is not a continuous action", "label": 0}, {"captions": ["a person sneezes followed by another person speaking", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["t8CV69hcvF0", "wyllXV6PjKo"], "start_seconds": ["210", "30"], "properties": ["person, sneeze, follow", "a baby, a woman, a man"], "captions_pred_video": ["of an airplane flying in the dark sky at night", null], "captions_pred_audio": ["a woman sneezes and speaks", "a woman speaks and a baby cries"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "water pouring and bubbling"], "sample_ids": ["uOpoD0gGXcs", "uyRfq-jKPpo"], "start_seconds": ["120", "50"], "properties": ["chirps, woman, bird", "water, bubbles, pouring"], "captions_pred_video": ["a herd of cows grazing in the field", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds are chirping and a 
man is speaking", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "a man speaks uses a drill"], "sample_ids": ["w-4gHptFNuU", "x5eIC7S0fbg"], "start_seconds": ["21", "60"], "properties": ["engine revs, accelerates, bump", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xfaoyyzw2WU", "xKB8O8LTs6s"], "start_seconds": ["180", "70"], "properties": ["loud, jet engine, roar", "music, gunfire, explosion"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wRBHTgrbiwg", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["bird, owl, speak", "a woman, a television program, a bird"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["sawing of wood and rustling 
with leaves blowing in the distance", "wind blowing followed by a zoom"], "sample_ids": ["uiItxDsDMFI", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["sound, distance, leaves", "wind, blow, zoom"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a saw is being used with background noise ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom?", "label": 1}, {"captions": ["an engine starts and increases in power", "waves crash against a shoreline and people speak"], "sample_ids": ["zjTG0gaGCUI", "yFB25fqfU8I"], "start_seconds": ["80", "300"], "properties": ["power, increase, engine", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["tK4VlLsNxak", "rqu8iB22IY"], "start_seconds": ["120", "5"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "sound, repeats, laugh"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a dog barks and a man speaks while music plays "], "question": "which entity is more like a scream", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "a stream of water runs briefly"], "sample_ids": ["wqADXCzngMw", "x-PeY8Yb8M4"], "start_seconds": ["340", "300"], "properties": ["audio, humming, revving", "stream, water, run"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a man sitting on a rock in the middle of a 
river"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a clock ticktocks continuously", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vlJS7LN2XyM", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["ticktocks, clock, ticktocks continuously", "loud, jet engine, roar"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a ticktock of a clock", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "a person snores loudly multiple times at a close distance"], "sample_ids": ["skd2PphS6oI", "sSMl2vc3ek"], "start_seconds": ["190", "20"], "properties": ["ring, bird, vocalize", "loud, multiple, distance"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", null], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["xyx6eNVEYRY", "xjhAnI2q6hM"], "start_seconds": ["380", "6"], "properties": ["loud, engine, muffles", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an infant crying and a woman 
speaking with some distant murmuring", "wind blows as people chatter quietly"], "sample_ids": ["smDKStoHBJo", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["a, infant, speaking", "wind, chatter, people"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a machine beeps continuously"], "sample_ids": ["yaln9y8I7ms", "y682ml90jGw"], "start_seconds": ["230", "11"], "properties": ["female, flushes, toilet", "beeps, machine, continuously"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "birds chirp and objects are moved around"], "sample_ids": ["vJ7JPEFhyLA", "yPUYU6t3rwo"], "start_seconds": ["16", "370"], "properties": ["three men, wind, flow", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "an airplane engine runs"], "sample_ids": ["vddP56-ogds", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["water, splash, person, laugh", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["water is running and gurgling and a man 
is speaking", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wyllXV6PjKo", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["a baby, a woman, a man", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a baby?", "label": 0}, {"captions": ["here comes the train and it starts to blow the horn and get close", "people speak as gunfire rings out"], "sample_ids": ["s7knHCFW82w", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["blow horn, get close, train", "gunfire, ring, speak"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "a door opens and closes"], "sample_ids": ["yswmmRZFItk", "vBHyYJ8pL0"], "start_seconds": ["0", "2"], "properties": ["background, frog, croak", "open, close, door"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is a door?", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a infant makes noise and is excited"], "sample_ids": ["s7knHCFW82w", "wIJK3-5y0kA"], 
"start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "noise, excited, infant"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a baby cries and a woman speaks"], "question": "which is louder", "label": 0}, {"captions": ["a man speaks while rain falls onto a hard surface", "water flows and trickles"], "sample_ids": ["wqN6IIHw3po", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "water, flow, trickle"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and water is splashing", "water is splashing and gurgling"], "question": "which entity is flowing", "label": 1}, {"captions": ["a dog barks and whimpers", "water splashes as an animal walks through"], "sample_ids": ["sShpyu2l4YQ", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["barks, whimpers, dog", "animal, water, splashes"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a dog is barking and growling", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "a helicopter engine runs continuously"], "sample_ids": ["vZAw4apG0Es", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["people, clock, converse", "engine, running, continuously"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a clock is ticking 
and people are talking", "a helicopter is flying overhead "], "question": "which entity is running continuously", "label": 1}, {"captions": ["an animal quacks rapidly", "a man speaks as a car is passing by"], "sample_ids": ["vh30P49Po6s", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["animal, quacks, rapidly", "a, car, pass"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a child speaks in closed space"], "sample_ids": ["tDVADusiIoc", "yW6FWLSLkx4"], "start_seconds": ["60", "40"], "properties": ["man, radio, blows", "child, space, speak"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a duck quacks continuously"], "sample_ids": ["tQWGZLItBXk", "vh30P49Po6s"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "quacks, continuously, duck"], "captions_pred_video": ["worms revolution screenshots", "of a man brushing his 
teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a duck is quacking loudly"], "question": "which entity is a duck?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "water splashes as an animal walks through"], "sample_ids": ["s3cTDAj31g", "w1ir-sZ3Im8"], "start_seconds": ["80", "90"], "properties": ["man, talk, woman", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and a baby is crying", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yeFvk9x0wWI", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["clack, bird, chirp", "three men, wind, flow"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a liquid flowing?", "label": 1}, {"captions": ["a clock ticktocks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["v-g-j2uTByM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, ticktocks", "loud, multiple, distance"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", null], "captions_pred_audio": ["a clock is ticking loudly", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["water bubbles and gurgles.", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tB7hWb9gTuQ", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], 
"properties": ["bubbles, gurgles, water", "gun, shoot, water"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["water is splashing and gurgling", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a man speaks as a car is passing by"], "sample_ids": ["u--KhUW8l1Y", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "a, car, pass"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a person is burping while a girl speaks", "people speak as gunfire rings out"], "sample_ids": ["vdoxuJn9lTc", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["person, burp, girl", "gunfire, ring, speak"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "a small airplane approaches and then flies by, 
after and during which a boy speaks"], "sample_ids": ["tgbONvsP47Y", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["noise, truck, accelerate", "airplane, boy, fly"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car is driving on the road ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a woman speaks happily and an animal chirps"], "sample_ids": ["wPz6QRAkEb4", "uWAAAL4CIoc"], "start_seconds": ["60", "0"], "properties": ["chirps, tweets, song", "a woman, chirps, animal"], "captions_pred_video": ["a bird in a cage on top of a pole", null], "captions_pred_audio": ["birds are chirping in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a child speaks in closed space"], "sample_ids": ["wvKpEYswXO0", "yW6FWLSLkx4"], "start_seconds": ["150", "40"], "properties": ["water, tap, run", "child, space, speak"], "captions_pred_video": ["of the person preparing food in the kitchen", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a man speaks as a motor runs in the background"], "sample_ids": ["s7knHCFW82w", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "background, motor, run"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the 
side of the road", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a man speaking to a background?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wz7N8YRy74I", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["rooster, crow, background, people", "a, scream, girl"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "someone is typing on a computer keyboard"], "sample_ids": ["wSVhSdj0F0", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["horn honks, keys jingle, slam", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a person is typing on a keyboard"], "question": "which entity is typing on a computer keyboard", "label": 1}, {"captions": ["a person speaks briefly", "continuous sneezing together with speech"], "sample_ids": ["zOZleIRqZm4", "x4dZyf9Gbj0"], "start_seconds": ["80", "130"], "properties": ["person, talk, brief", "continuous, sneeze, speech"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman sneezes and speaks"], "question": "which entity 
is more like a person talking", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["yDoT73BWsdA", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["engine revs, tires squeal, vehicle", "loud, jet engine, roar"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a man speaks as a motor runs in the background"], "sample_ids": ["yYEVLuqEytU", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["grunt, slurp, background", "background, motor, run"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a frog vocalizes as birds chirp"], "sample_ids": ["yajyRTUQk3U", "wqUmIEzuNz4"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "frog, bird, vocalize"], "captions_pred_video": ["- a woman cooking in the kitchen", "a frog sitting in the grass on a sunny day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a cat meows and rustles"], "question": "which entity is a frog", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a helicopter engine idles continuously"], "sample_ids": ["ugHJF0hfYkg", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["loud, intense, propeller", "engine, idle, 
continuously"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a helicopter is flying overhead ", "a helicopter is flying overhead "], "question": "which entity is quieter", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yZrFNS7GFBQ", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["pigeon, buzzes, insect", "clickety-clack, train, whistle"], "captions_pred_video": ["of the bird in the cage", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an owl hoots in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "waves crash against a shoreline and wind blows"], "sample_ids": ["uJV8NDaHqqk", "zdYdyF9-m8U"], "start_seconds": ["100", "7"], "properties": ["loud, fly, chirp", "wind, crash, shoreline"], "captions_pred_video": ["a bee hive in a wooden box", "a person kayaking in the ocean near a cliff"], "captions_pred_audio": ["a swarm of bees buzzing around", "waves crash and wind blows "], "question": "which entity is more quiet", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vJ7JPEFhyLA", "w5W5Kqtc8E"], "start_seconds": ["16", "100"], "properties": ["three men, wind, flow", "wind, blow, vehicle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has more wind blowing", "label": 1}, {"captions": ["speaking following by 
laughing and clapping", "paper folding and crinkling"], "sample_ids": ["u2f5NpsoHBg", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["person, laugh, clap", "paper, fold, crinkle"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "the wind blows and a mouse clicks "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["su6FAOcOA8c", "rqu8iB22IY"], "start_seconds": ["4", "5"], "properties": ["engine, run, woman", "sound, repeats, laugh"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a dog barks and a man speaks while music plays "], "question": "which entity has a woman making an announcement?", "label": 0}, {"captions": ["a woman speaks with water running", "a toilet flushes and a female speaks"], "sample_ids": ["wTideSjRFS0", "yaln9y8I7ms"], 
"start_seconds": ["30", "230"], "properties": ["water, running, woman", "female, flushes, toilet"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a toilet flushes and a man speaks"], "question": "which entity has a woman speaking while water is running?", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "paper is crumpling consistently"], "sample_ids": ["zkKdxzNC97Y", "v5cSxLaHADY"], "start_seconds": ["27", "0"], "properties": ["loud, bang, noise", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a door is opened and closed", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["some clanking with distant murmuring", "wind blows as people chatter quietly"], "sample_ids": ["uMTTDZ2mb4", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["clanking, murmuring, distant", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["long loud burping by a man", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xmiUIOhtZyQ", "uZesmtKZGSw"], "start_seconds": ["60", "250"], "properties": ["loud, burp, man", "men, talk, cars"], "captions_pred_video": ["homer simpson drinking a beer", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vuUVPzd2FXw", "su6FAOcOA8c"], "start_seconds": ["160", "4"], "properties": ["a, steam, release", "engine, idle, woman"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a woman is speaking and a subway train is moving "], "question": "which entity is a man?", "label": 0}, {"captions": ["multiple motorcycles pass by as a man speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zcDwZ6W7E3E", "tiDFTC-5vU"], "start_seconds": ["180", "30"], "properties": ["man, speak, motorcycles", "male, duck, laugh"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "paper is crumpling consistently"], "sample_ids": ["wwyfGO2J4", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["people, applaud, 
hoot", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w34HjHr6gAY", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["beeps, hit, woman", "stream, water, flow"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage is blurry and out of focus"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "people speak as gunfire rings out"], "sample_ids": ["vK93VuO0yNc", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["male voice, bus, rumble", "gunfire, ring, speak"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["birds chirps while a siren 
signals in the distance", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uKCSGgof8gI", "xKB8O8LTs6s"], "start_seconds": ["12", "70"], "properties": ["chirps, distance, signal", "music, gunfire, explosion"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "an insect buzzes around continuously"], "sample_ids": ["sjlVMgdGSK0", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["accelerates, vehicle, race car", "buzzes, continuously, insect"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a fly is buzzing around a microphone "], "question": "which entity buzzes around continuously", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["u--KhUW8l1Y", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["horn, siren, life", "wind, blow, vehicle"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", null], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["wyllXV6PjKo", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a baby, a woman, a man", "a, car, 
pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a baby?", "label": 0}, {"captions": ["people speak and tapping occurs", "repeated tapping is accompanied by water running and a woman speaking softly"], "sample_ids": ["tFCUUGdREgA", "wvKpEYswXO0"], "start_seconds": ["70", "150"], "properties": ["people, tap, speak", "sound, water, running"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity has a woman speaking softly?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "an airplane engine runs"], "sample_ids": ["zl9Dqx-j7q4", "yVPZ2MNWpms"], "start_seconds": ["6", "0"], "properties": ["engine, laugh, loud", "engine, airplane, runs"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a jet engine roars ", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vbr9mHKc8WM", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], 
"properties": ["noise, loudness, engine", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["an engine is idling", "a goat bleats and birds chirp"], "question": "which entity is quieter", "label": 0}, {"captions": ["a man speaks as music plays before artillery is fired", "a woman speaks happily and an animal chirps"], "sample_ids": ["vcmWSmvti8", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["music, man, fire", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a woman is speaking and a dog is barking "], "question": "which entity is more peaceful", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vSeGhaZt-aI", "w5W5Kqtc8E"], "start_seconds": ["50", "100"], "properties": ["water, sink, talk", "wind, blow, vehicle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about water running down a sink?", "label": 0}, {"captions": ["leaves rustle while man speaks", "plastic is tapped on while someone speaks"], "sample_ids": ["zOZleIRqZm4", "wvKpEYswXO0"], "start_seconds": ["80", "150"], "properties": ["leaves, rustle, speak", "plastic, tap, speak"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a machine engine runs and a man 
speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vs65y4qmyBE", "wqZ135Ssz0"], "start_seconds": ["340", "60"], "properties": ["engine, run, man", "two men, woman, birds"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "an airplane engine spools and people speak"], "sample_ids": ["uEU-Hg5MTN8", "wTjoRj1se3U"], "start_seconds": ["27", "390"], "properties": ["a woman, laughs, animal", "airplane, engine, spool"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking and an animal snorting?", "label": 0}, {"captions": ["water splashes and wind noise is made into a microphone", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["sDSppXIlJrs", "vqZuVbG6-HI"], "start_seconds": ["27", "130"], "properties": ["microphone, water, wind", "background, male, female"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "footage is blurry because it's raining outside"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a lawn mower is running and men are speaking "], "question": "which entity has more background noise", "label": 1}, {"captions": ["roadway noise occurs and a truck accelerates", "a car accelerates and wind blows"], "sample_ids": ["tgbONvsP47Y", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["noise, truck, accelerate", "accelerates, 
wind, blows"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["someone snores nearby", "multiple people speak and children yell while water gurgles"], "sample_ids": ["spJCm8tD9Zo", "vb1fPSDI4c"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "multiple, people, yell"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a sleeping person emits a gravely snore", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w2JXXIAdUdg", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["emits, sleeping, person", "three men, wind, flow"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a person emitting a gravely snore?", "label": 0}, {"captions": ["motors rev and run loudly as a person laughs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zl9Dqx-j7q4", "vJ7JPEFhyLA"], "start_seconds": ["6", "16"], "properties": ["motors rev, laugh, loudly", "three men, wind, flow"], "captions_pred_video": ["footage of a man driving a car in the dark", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks with another voice speaking in the 
background", "someone whistles a tune"], "sample_ids": ["u21-Z5gJCB8", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["background, voice, man", "someone, tune, whistle"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a stream runs then someone speaks", "people speak as gunfire rings out"], "sample_ids": ["wbHTKEJZyhc", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["stream, run, someone", "gunfire, ring, speak"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and a gun is 
fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["slZLHwNbbt4", "w5W5Kqtc8E"], "start_seconds": ["300", "100"], "properties": ["clap, distance, horn", "wind, blow, vehicle"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vBslzh7saPw", "w5W5Kqtc8E"], "start_seconds": ["90", "100"], "properties": ["engine, roar, louder", "wind, blow, vehicle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "water splashes and wind noise is made into a microphone"], "sample_ids": ["y4tPJXBKDig", "sDSppXIlJrs"], "start_seconds": ["20", "27"], "properties": ["a, noise, talk", "microphone, water, wind"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "a man is paddling a small wooden boat in the water"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "the wind is blowing and water is splashing"], "question": "which noise is made by water and wind", "label": 1}, {"captions": ["a man is filing a hard object", "a car accelerates and wind blows"], "sample_ids": ["vveS8HT7Uog", "u0TrcHhkPQ"], "start_seconds": ["100", "20"], 
"properties": ["a man, hard, object", "accelerates, wind, blows"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a race car accelerates and revs its engine "], "question": "which object is moving", "label": 1}, {"captions": ["a stream of water runs briefly", "a man speaks as a car is passing by"], "sample_ids": ["x-PeY8Yb8M4", "sK4u5T8hW78"], "start_seconds": ["300", "30"], "properties": ["stream, water, run", "a, car, pass"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a woman speaks as she rubs two objects together"], "sample_ids": ["viuTg1M-dqg", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["two men, speak, follow", "two objects, woman, speak"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a woman speaking?", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a child speaks in closed space"], "sample_ids": ["tDVADusiIoc", "yW6FWLSLkx4"], "start_seconds": ["60", "40"], "properties": ["wind, radio, waves", "child, space, speak"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a man speaks as a machine runs", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vD6lYD1l0BY", "uEU-Hg5MTN8"], "start_seconds": ["330", "27"], "properties": ["a, machine, run", "a woman, laughs, animal"], "captions_pred_video": ["game controller being held in the hands of the person", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking as a machine runs?", "label": 0}, {"captions": 
["a person sniffles and sneezes", "people speak and tapping occurs"], "sample_ids": ["uRlbY6aoBU", "tFCUUGdREgA"], "start_seconds": ["0", "70"], "properties": ["sneezes, sniffles, person", "people, tap, speak"], "captions_pred_video": [null, "a person riding a white horse in an indoor arena"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and walking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "an infant crying frantically"], "sample_ids": ["vmrxwuAMb2I", "zwOBqeFTgiU"], "start_seconds": ["40", "30"], "properties": ["a dog, inhales, exhales", "cry, infant, frantically"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "of the baby crying in the car seat"], "captions_pred_audio": ["a dog barks and growls", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a horn rings out as a machine runs by"], "sample_ids": ["xKB8O8LTs6s", "slZLHwNbbt4"], "start_seconds": ["70", "300"], "properties": ["music, radio, gunshots", "a, horn, run"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity has a horn", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "paper is crumpling consistently"], "sample_ids": ["sEprKHm8Sj8", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["noise, loud, buzzing", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 
2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["birds chirp, a woman speaks, and insects buzz", "a woman speaks happily and an animal chirps"], "sample_ids": ["t97k0cejSQE", "uWAAAL4CIoc"], "start_seconds": ["250", "0"], "properties": ["sound, chirp, buzz", "a woman, chirps, animal"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a woman is speaking and a dog is barking "], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uRlbY6aoBU", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["a, distance, sneeze", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is sneezing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man talks as several small engines run", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["u9A6VZQCZpU", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["a, man, talk", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["loud, continuous 
burping", "people applaud and hoot and chat quietly"], "sample_ids": ["y636gklDioE", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["loud, continuous, burping", "people, applaud, hoot"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a toilet flushes and a female speaks"], "sample_ids": ["y2bVZ7rz-5M", "yaln9y8I7ms"], "start_seconds": ["280", "230"], "properties": ["engine, horn, siren", "female, flushes, toilet"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage is blurry and out of focus"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["s6DESzUTGjY", "wz7N8YRy74I"], "start_seconds": ["16", "30"], "properties": ["wind, laugh, woman", "rooster, crow, background, men"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 
easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["ukxt9I7eMMg", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["continuous, woman, speaking", "men, talk, cars"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has a woman speaking towards the end?", "label": 0}, {"captions": ["a man speaks with another voice speaking in the background", "a horn rings out as a machine runs by"], "sample_ids": ["u21-Z5gJCB8", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["background, voice, man", "a, horn, run"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of a train coming down the tracks on a sunny day"], 
"captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "paper is crumpling consistently"], "sample_ids": ["y8WEcpOlT3I", "v5cSxLaHADY"], "start_seconds": ["40", "0"], "properties": ["harsh, wind, blows", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vmrxwuAMb2I", "xKB8O8LTs6s"], "start_seconds": ["40", "70"], "properties": ["a dog, inhales, exhales", "music, gunfire, explosion"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog barks and growls", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tEE3MpBt1sg", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "rooster, crow, background, men"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking birds are chirping and a rooster 
is crowing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["xZepNM9qcRA", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["background, motor, run", "sheep, baa, birds"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vlJS7LN2XyM", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["background, clocks, ticking", "beeps, hit, woman"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a ticktock of a clock", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["paper is crumpling consistently", "an infant crying as a woman laughs"], "sample_ids": ["v5cSxLaHADY", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["paper is crumpling, paper is 
white, paper is crumpling", "a, laugh, infant"], "captions_pred_video": ["footage of the person holding a pair of scissors", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["paper is crumpled and crinkled", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a person is whistling"], "sample_ids": ["wIJK3-5y0kA", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["a, cry, baby", "person, whistling, person"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a person whistling a song"], "question": "which entity is a person?", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["zuua6-5goWw", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["sound, pop, bird", "two men, speak, follow"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a woman 
talks while something is fried and objects are tapped"], "sample_ids": ["y2ZBGpgbhHM", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["birds, tweet, pant", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a machine runs continuously", "waves crash against a shoreline and people speak"], "sample_ids": ["wdXV3Pv0jiY", "yFB25fqfU8I"], "start_seconds": ["11", "300"], "properties": ["machine, running, continuously", "wave, crash, shoreline"], "captions_pred_video": ["footage is blurry and shaky", "footage of a person surfing in the ocean"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is not a machine?", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "dishes cling together then a man begins to speak"], "sample_ids": ["yZrFNS7GFBQ", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["pigeon, buzzes, insect", "cling, speak, dishes"], "captions_pred_video": ["of the bird in the cage", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["an owl hoots in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a man speaks followed by another man speaking outside"], "sample_ids": ["xvDdE3zNf8Y", "viuTg1M-dqg"], "start_seconds": ["120", "30"], "properties": ["a, female, speaks", "two men, speak, follow"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman speaks and crumples 
paper", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person speaks briefly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zOZleIRqZm4", "uYT5gxnyMWM"], "start_seconds": ["80", "50"], "properties": ["person, talk, brief", "female, spraying, scream"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity is a person talking?", "label": 0}, {"captions": ["a person snoring", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["t8tv5YRMJUg", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["a person, snore, loud", "engine, laugh, loud"], "captions_pred_video": ["of a man getting his face licked by another man", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ziUT9IFTkjg", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["background, birds, rustling", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "three men talk while wind blows and some liquid flows"], "sample_ids": ["smDKStoHBJo", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], 
"properties": ["a, talk, baby, cry", "three men, wind, flow"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["an engine runs and wind blows", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vs65y4qmyBE", "vYkA3cfXp5Q"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "engine, accelerate, idle"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "an engine is idling"], "question": "which entity is a vehicle engine?", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yYEVLuqEytU", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["grunt, slurp, background", "clickety-clack, train, whistle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a clock ticktocks"], "sample_ids": ["vVhthZ45k3Y", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["cat, purr, hiss", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is blurry and out of focus", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a clock is 
ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["bees buzz and wind blows", "a horn rings out as a machine runs by"], "sample_ids": ["tMJne1a4AFI", "slZLHwNbbt4"], "start_seconds": ["0", "300"], "properties": ["bees buzz, wind blows, bees", "a, horn, run"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a swarm of bees buzzing around", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "birds chirp and wind blows"], "sample_ids": ["s4Uz1Ffgo04", "sxIvBMSavMQ"], "start_seconds": ["100", "210"], "properties": ["roars, background, people speaking", "birds, chirp, wind"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "birds are chirping and insects are buzzing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a woman talks while something is fried and objects are tapped"], "sample_ids": 
["vmrxwuAMb2I", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["a dog, inhales, exhales", "a woman, something, fried"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a dog barks and growls", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a man talks as several small engines run", "vehicles pass by on a roadway"], "sample_ids": ["u9A6VZQCZpU", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["a, man, talk", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a car is driving on the road "], "question": "which entity is about vehicles?", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "an airplane accelerates briefly"], "sample_ids": ["xjvTpk2Zpr8", "zjTG0gaGCUI"], "start_seconds": ["70", "80"], "properties": ["wind, blows, vehicle", "accelerates, airplane, briefly"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a jet engine roars as wind blows "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a man speaks as a car is passing by"], "sample_ids": ["vimzuGQvdcU", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "a, car, pass"], "captions_pred_video": ["a group of people are rafting down a river", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more calm", "label": 1}, {"captions": ["loud, continuous burping", "a person speaks briefly"], "sample_ids": ["y636gklDioE", "zOZleIRqZm4"], "start_seconds": ["20", "80"], "properties": ["loud, continuous, burping", "person, talk, brief"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a person burps loudly several times", "a man is speaking with crickets chirping in the background"], "question": "which entity is talking", "label": 1}, {"captions": ["ticking continues without interruption", "a train horn blows as it passes by"], "sample_ids": ["v-g-j2uTByM", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "horn, blows, train"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a clock is ticking loudly", "a 
train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is not continuous", "label": 1}, {"captions": ["a person is whistling", "a man speaks as a motor runs in the background"], "sample_ids": ["sIXTftIuUgw", "xZepNM9qcRA"], "start_seconds": ["90", "30"], "properties": ["person, whistling, person", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person whistling a song", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vr8ZXjEBhMQ", "su6FAOcOA8c"], "start_seconds": ["150", "4"], "properties": ["sound, distance, engine", "engine, idle, woman"], "captions_pred_video": ["is taken from a motorcycle's point of view", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["weDbePuc-Xc", "yDoT73BWsdA"], "start_seconds": ["40", "10"], "properties": ["cartoon character, music, vocalize", "engine, revs, vehicle"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["people clap and speak in the distance", "motors 
rev and run loudly as a person laughs"], "sample_ids": ["wwyfGO2J4", "zl9Dqx-j7q4"], "start_seconds": ["90", "6"], "properties": ["clap, distance, speak", "motors rev, laugh, loudly"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "water flows as men speak and yell"], "sample_ids": ["w2JXXIAdUdg", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["emits, sleeping, person", "water, flow, men"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sQwlkXjQabo", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["water, spray, surface", "engine, revs, vehicle"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["spraying followed by silence", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["water bubbles and gurgles.", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tB7hWb9gTuQ", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["bubbles, gurgles, water", "music, gunfire, explosion"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "in your own words a screenshot of the game's loading screen"], 
"captions_pred_audio": ["water is splashing and gurgling", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a man speaks followed by another man speaking outside"], "sample_ids": ["ylpYOorfH4o", "viuTg1M-dqg"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "two men, speak, follow"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a woman speaks as she rubs two objects together"], "sample_ids": ["wz7N8YRy74I", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["rooster, crow, background, people", "two objects, woman, speak"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sfAvvZwdLCY", "w5W5Kqtc8E"], "start_seconds": ["20", "100"], "properties": ["water drains, flushes, water", "wind, blow, vehicle"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a train engine runs and a horn blows", "wind blows as people chatter quietly"], "sample_ids": ["zPX9o1uDiI", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["engine, horn, run", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a man speaks and a rooster crows while men talk in the 
background"], "sample_ids": ["vdoxuJn9lTc", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "rooster, crow, background, men"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "birds chirp and objects are moved around"], "sample_ids": ["zofjfKhqLk8", "yPUYU6t3rwo"], "start_seconds": ["10", "370"], "properties": ["noise, stop, motor", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["u21-Z5gJCB8", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["background, voice, man", "engine revs, vehicle, people"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "multiple motorcycles pass by as a man speaks"], "sample_ids": ["w6RTHR6AeAg", "zcDwZ6W7E3E"], "start_seconds": ["40", "180"], "properties": ["call, owl, screech", "man, speak, motorcycles"], 
"captions_pred_video": [null, "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yZrFNS7GFBQ", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["pigeon, buzzes, insect", "two men, woman, birds"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["siJFXfGWgDk", "uqFtmnhuqA8"], "start_seconds": ["50", "30"], "properties": ["man, woman, vehicle", "a, b, c"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "vehicles pass by on a roadway"], "sample_ids": ["sLUnaPT5gM8", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["loud, laughter, intermittent", "pass, vehicle, roadway"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a car is driving on the road "], "question": "which 
entity is moving", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vBslzh7saPw", "yDoT73BWsdA"], "start_seconds": ["90", "10"], "properties": ["engine, spools, takes", "engine, revs, vehicle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a race car accelerates and revs its engine "], "question": "which engine is revving", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["yPUYU6t3rwo", "tw76HGONaKg"], "start_seconds": ["370", "570"], "properties": ["birds chirp, objects are moved around, birds", "A, game, keyboard"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["insects buzz and a man speaks", "a man speaks and types on a computer keyboard "], "question": "which entity is a video of a man playing a video game?", "label": 1}, {"captions": ["goats bleat 
and metal clings", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tH17JPjDPnc", "xKB8O8LTs6s"], "start_seconds": ["260", "70"], "properties": ["bleat, metal, clings", "music, gunfire, explosion"], "captions_pred_video": ["feed of the goats eating hay in the barn", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["zVacuqSb4LI", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["blares, fades, train", "A, game, keyboard"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time 
walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a man speaks and types on a computer keyboard "], "question": "which entity is a video of a person playing a video game?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["tQWGZLItBXk", "tDlysoZiA1I"], "start_seconds": ["170", "0"], "properties": ["voice, music, whoosh", "animal, grunts, chirps"], "captions_pred_video": ["worms revolution screenshots", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "birds are chirping and a rooster is crowing "], "question": "which entity is more animal-like", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "an infant crying as a woman laughs"], "sample_ids": ["sU53zg9Jp7s", "xhmRY9yhC7c"], "start_seconds": ["380", "20"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a, laugh, infant"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "multiple people 
speak and children yell while water gurgles"], "sample_ids": ["shmR4OZtzqA", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "multiple, people, yell"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", null], "captions_pred_audio": ["a man speaks while a motor runs", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["sawing of wood and rustling with leaves blowing in the distance", "a stream of water runs briefly"], "sample_ids": ["uiItxDsDMFI", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["sound, distance, leaves", "stream, water, run"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a saw is being used with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a woman speaks as she rubs two objects together"], "sample_ids": ["u6jIvCtKarQ", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["a, man, speaks", "two objects, woman, speak"], "captions_pred_video": ["footage of a person using a blender on a stove top", "how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a man speaking over glass clinking?", "label": 0}, {"captions": ["water pouring and bubbling", "paper is crumpling consistently"], "sample_ids": ["uyRfq-jKPpo", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, pouring", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["water is running from a faucet", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a car speeding up in the distance"], "sample_ids": ["un9VQlzgZM", "u0TrcHhkPQ"], "start_seconds": ["5", "20"], "properties": ["wind, speak, laugh", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["an airplane accelerates briefly", "paper is crumpling consistently"], "sample_ids": ["zjTG0gaGCUI", "v5cSxLaHADY"], "start_seconds": ["80", "0"], "properties": ["accelerates, airplane, briefly", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a jet engine roars as wind blows ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a woman talking as an infant is crying", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tMbMDvT50j8", "zl9Dqx-j7q4"], "start_seconds": ["12", "6"], "properties": ["a, talk, infant", "engine, laugh, loud"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a baby cries and a woman speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["an airplane engine runs", "people cheer as a vehicle engine revs"], "sample_ids": ["yVPZ2MNWpms", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], 
"properties": ["engine, airplane, runs", "engine revs, vehicle, people"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a car is driving by on the road ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zcDwZ6W7E3E", "tdWhHV3X25Q"], "start_seconds": ["180", "60"], "properties": ["a, man, speak", "applause, audience, yells"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a car accelerates and wind blows"], "sample_ids": ["vZAw4apG0Es", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, tick, repeat", "accelerates, wind, blows"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tw76HGONaKg", "yajyRTUQk3U"], "start_seconds": ["570", "400"], "properties": ["music, click, man", "a woman, something, fried"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda 
ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a beep repeats multiple times", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["y682ml90jGw", "zl9Dqx-j7q4"], "start_seconds": ["11", "6"], "properties": ["beep, repeat, multiple", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a beeping sound is being made ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "paper is crumpling consistently"], "sample_ids": ["uPDn2BFTHk", "v5cSxLaHADY"], "start_seconds": ["140", "0"], "properties": ["woman, laughs, speaks", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a baby laughs and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "an infant crying 
as a woman laughs"], "sample_ids": ["uOpoD0gGXcs", "xhmRY9yhC7c"], "start_seconds": ["120", "20"], "properties": ["chirps, woman, bird", "a, laugh, infant"], "captions_pred_video": ["a herd of cows grazing in the field", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "vehicles pass by on a roadway"], "sample_ids": ["vXlk0lIQBFo", "tgbONvsP47Y"], "start_seconds": ["470", "0"], "properties": ["wind, speak, vocalize", "pass, vehicle, roadway"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a fire truck entering a garage"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "wind blows as people chatter quietly"], "sample_ids": ["uEU-Hg5MTN8", "xBxDz0CFVn0"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "wind, chatter, people"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vXlk0lIQBFo", "uYT5gxnyMWM"], "start_seconds": ["470", "50"], "properties": ["wind, talk, vocalize", "a, scream, girl"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a 
woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "people cheer as a vehicle engine revs"], "sample_ids": ["wqADXCzngMw", "xjhAnI2q6hM"], "start_seconds": ["340", "6"], "properties": ["engine, idle, man", "engine revs, vehicle, people"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle engine?", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a propeller rotates loudly and intensely"], "sample_ids": ["xBxDz0CFVn0", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["stream, water, flow", "loud, intense, propeller"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a helicopter is flying overhead "], "question": "which entity is quieter", "label": 0}, {"captions": ["a baby cries and a woman speaks", "a duck quacks continuously"], "sample_ids": ["tMbMDvT50j8", "vh30P49Po6s"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "quacks, continuously, duck"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a baby cries and a woman speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yI-KvObbDoY", "vJ7JPEFhyLA"], 
"start_seconds": ["260", "16"], "properties": ["sound, smack, wind", "three men, wind, flow"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a liquid flowing?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w5W5Kqtc8E", "wDVMhEdTiVw"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is about water?", "label": 0}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a car speeding up in the distance"], "sample_ids": ["wRBHTgrbiwg", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["bird, owl, speak", "distance, car, speed"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "an engine idles quietly then gradually becomes louder"], "sample_ids": ["wSVhSdj0F0", "vbr9mHKc8WM"], "start_seconds": ["10", "40"], "properties": ["horn honks, keys jingle, slam", "noise, loudness, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "an engine is idling"], "question": "which entity is quieter", 
"label": 1}, {"captions": ["birds coo incessantly", "a clock ticktocks"], "sample_ids": ["yZrFNS7GFBQ", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["coo, bird, incessant", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of the bird in the cage", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an owl hoots in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks uses a drill", "an infant crying as a woman laughs"], "sample_ids": ["x5eIC7S0fbg", "xhmRY9yhC7c"], "start_seconds": ["60", "20"], "properties": ["A man is speaking, uses a drill, and is a tool", "a, laugh, infant"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["v0x1odnXtP0", "zj2R0XoFr5k"], "start_seconds": ["210", "50"], "properties": ["keyboard, type, computer", "airplane, boy, fly"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "paper is crumpling consistently"], "sample_ids": ["uRExseg-0XI", "v5cSxLaHADY"], "start_seconds": ["210", "0"], "properties": ["woman, man, water", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["stock footage or video footage of a person 
stirring a pot on a stove with a long-handled wooden spoon", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xBxDz0CFVn0", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["stream, water, flow", "wind, blow, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["goats bleat and metal clings", "a man speaks as a car is passing by"], "sample_ids": ["tH17JPjDPnc", "sK4u5T8hW78"], "start_seconds": ["260", "30"], "properties": ["bleat, metal, clings", "a, car, pass"], "captions_pred_video": ["feed of the goats eating hay in the barn", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a woman speaks happily and an animal chirps"], "sample_ids": ["y2ZBGpgbhHM", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["birds, tweet, pant", "a woman, 
chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be in a forest", "label": 0}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zuua6-5goWw", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["sound, pop, bird", "engine, laugh, loud"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["se87d6yxEOA", "uZesmtKZGSw"], "start_seconds": ["10", "250"], "properties": ["run, whistle, pass", "men, talk, cars"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is moving faster", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a child speaks in closed space"], "sample_ids": ["vqZuVbG6-HI", "yW6FWLSLkx4"], "start_seconds": ["130", "40"], "properties": ["background, male, female", "child, space, speak"], "captions_pred_video": ["footage is blurry because it's raining outside", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "small dogs yip and bark sharply"], "sample_ids": ["sapQIQUhFc", "v-wcQf4BDY0"], "start_seconds": ["280", "120"], "properties": ["liquid, flow, distance", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["wvKpEYswXO0", "xjhAnI2q6hM"], "start_seconds": ["150", "6"], "properties": ["plastic, tap, speak", "engine revs, vehicle, people"], "captions_pred_video": ["of the person preparing food in the 
kitchen", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a truck is revving its engine and a man is speaking "], "question": "what is being tapped on?", "label": 0}, {"captions": ["a man speaks then multiple motorcycles pass by", "a woman speaks as she rubs two objects together"], "sample_ids": ["zcDwZ6W7E3E", "vzxHnu-SFEw"], "start_seconds": ["180", "80"], "properties": ["a, man, speak", "two objects, woman, speak"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["multiple birds chirp and an animal grunts", "a woman and man speak while food is frying"], "sample_ids": ["tDlysoZiA1I", "zk-xJGQU8-4"], "start_seconds": ["0", "130"], "properties": ["animal, grunt, 
multiple", "food, man, woman"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a toilet flushes and a female speaks"], "sample_ids": ["un9VQlzgZM", "yaln9y8I7ms"], "start_seconds": ["5", "230"], "properties": ["wind, speak, laugh", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["someone whistles a tune", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sIXTftIuUgw", "vb1fPSDI4c"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a dog whimpers and a woman briefly talks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y1saVTXsKwc", "uZesmtKZGSw"], "start_seconds": ["80", "250"], "properties": ["a, dog, talk", "men, talk, cars"], "captions_pred_video": ["a dog playing with a pink ball", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a dog and a woman talking?", "label": 0}, {"captions": ["a motorcycle idles loudly as wind blows", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["v7jJS8aAyA", "tDlysoZiA1I"], "start_seconds": ["10", "0"], "properties": ["wind, blows, loudly", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "birds are chirping and a rooster is crowing "], "question": "which entity is quieter", "label": 1}, {"captions": ["paper is crumpling consistently", "a man speaks as a motor runs in the background"], "sample_ids": ["v5cSxLaHADY", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "background, motor, run"], "captions_pred_video": ["footage of the person holding a pair of scissors", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a car accelerates and wind blows"], "sample_ids": ["y2ZBGpgbhHM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["dog, chirp, breathe", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["cats meow and then a person 
begins to talk while the cats continue to meow", "a clock ticktocks"], "sample_ids": ["x5cuQjOdM3E", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["cat, talk, meow", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a black background with an airplane flying in the sky", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a cat meows and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["paper is crumpling consistently", "a woman and man are speaking"], "sample_ids": ["v5cSxLaHADY", "vbpKkWvfOu4"], "start_seconds": ["0", "560"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "two people, speaking, woman, man"], "captions_pred_video": ["footage of the person holding a pair of scissors", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman is speaking and a man is speaking"], "question": "which entity is a video of two people speaking?", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vms5XGTDVQc", "vJ7JPEFhyLA"], "start_seconds": ["220", "16"], "properties": ["paper, crumpled, crinkled", "three men, wind, flow"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a video of a man talking while wind blows and some liquid flows?", "label": 0}, {"captions": ["small dogs yip and bark sharply", "a car speeding up in the distance"], 
"sample_ids": ["v-wcQf4BDY0", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["bark, yip, sharply", "distance, car, speed"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", null], "captions_pred_audio": ["a dog barks and growls", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a person speaks over rustling leaves", "some tunes played by whistling"], "sample_ids": ["zOZleIRqZm4", "u6BnG6YZqJ4"], "start_seconds": ["80", "0"], "properties": ["rustling, leaves, person", "tune, play, whistling"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["tDVADusiIoc", "tw76HGONaKg"], "start_seconds": ["60", "570"], "properties": ["man, radio, blows", "A, game, keyboard"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina 
of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man speaks and types on a computer keyboard "], "question": "which man is speaking", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "people applaud and hoot and chat quietly"], "sample_ids": ["su6FAOcOA8c", "wwyfGO2J4"], "start_seconds": ["4", "90"], "properties": ["engine, idle, woman", "people, applaud, hoot"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a person speaks over rustling leaves", "wind blowing followed by a zoom"], "sample_ids": ["zOZleIRqZm4", "vr8ZXjEBhMQ"], "start_seconds": ["80", "150"], "properties": ["rustling, leaves, person", "wind, blow, zoom"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a dark barks and whimpers", "pigeons vocalize and birds chirp"], "sample_ids": ["sYj4hpDUZDQ", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["barks, whimpers, dark", "vocalize, bird, chirp"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "of the pigeon in the cage"], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a person snores loudly multiple 
times at a close distance"], "sample_ids": ["w2M4i1mklOA", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["alarm, gears, turn", "loud, multiple, distance"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "an adult male speaks and dials a rotary phone"], "sample_ids": ["wPz6QRAkEb4", "tK4VlLsNxak"], "start_seconds": ["60", "120"], "properties": ["chirps, tweets, song", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": ["a bird in a cage on top of a pole", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["birds are chirping in the background ", "a man is speaking and using a sewing machine"], "question": "which entity is speaking", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "a child speaks in closed space"], "sample_ids": ["w9lpbUn0hPc", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["male, wind, rustling", "child, space, speak"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sapQIQUhFc", "uEU-Hg5MTN8"], "start_seconds": ["280", "27"], "properties": ["water, stream, trickles", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask 
and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a telephone rings followed by a woman talking"], "sample_ids": ["xjhAnI2q6hM", "tGcFnX0GHI"], "start_seconds": ["6", "0"], "properties": ["engine revs, vehicle, people", "ring, talk, woman"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a man speaks as a car is passing by"], "sample_ids": ["y2bVZ7rz-5M", "sK4u5T8hW78"], "start_seconds": ["280", "30"], "properties": ["engine, horn, siren", "a, car, pass"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sEprKHm8Sj8", "yajyRTUQk3U"], "start_seconds": ["90", "400"], "properties": ["car, tires, slows", "a woman, something, 
fried"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sHbXC6na9hg", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "engine, accelerate, idle"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["an engine is idling and vibrating", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["continuous snoring", "a car accelerates and wind blows"], "sample_ids": ["sLkeqCDJIyw", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["loud, snoring, noise", "accelerates, wind, blows"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is not a noise", "label": 1}, {"captions": ["bees buzz and wind blows", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tMJne1a4AFI", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["bees buzz, wind blows, bees", "a woman, a television program, a bird"], "captions_pred_video": ["a swarm of bees on the ground", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking and a dog is whimpering"], "question": "which entity is a video of a television program?", "label": 1}, {"captions": ["several ducks are quacking and squawking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wfHeoPDLMaM", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "multiple, people, yell"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a crowd of people are talking and laughing"], "question": 
"which entity is more quiet", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "water splashes and a door squeaks"], "sample_ids": ["x4a9YGIw4ok", "sdXV-ylviw"], "start_seconds": ["120", "190"], "properties": ["water, gurgles, stops", "sound, splash, door"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and water splashes", "a dog barks and taps with background noise "], "question": "which entity has a door squeak?", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "some tunes played by whistling"], "sample_ids": ["spYNpeN7rPY", "u6BnG6YZqJ4"], "start_seconds": ["1", "0"], "properties": ["a clock, ticktock, man", "tune, play, whistling"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a person whistling a song"], 
"question": "which entity is playing tunes", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a horn rings out as a machine runs by"], "sample_ids": ["x6ijhqRY38s", "slZLHwNbbt4"], "start_seconds": ["250", "300"], "properties": ["something metal, glass, hit", "a, horn, run"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["bees buzz as wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["tMJne1a4AFI", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["bees, buzz, wind", "two men, speak, follow"], "captions_pred_video": ["a swarm of bees on the ground", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single speaker?", "label": 0}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "pigeons vocalize and birds chirp"], "sample_ids": ["uEU-Hg5MTN8", "uiS58TNyUiw"], "start_seconds": ["27", "430"], "properties": ["animal, grunts, snorts", "vocalize, bird, chirp"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["food is frying while a woman speaks", "someone snores nearby"], "sample_ids": ["yhQ2Lg-7qDY", "spJCm8tD9Zo"], "start_seconds": ["130", "90"], "properties": ["food, woman, speak", "someone snores, nearby, someone"], 
"captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a person is snoring loudly"], "question": "what is a person doing in the first picture?", "label": 0}, {"captions": ["ticking continues without interruption", "a stream of water flows as people talk and wind blows"], "sample_ids": ["v-g-j2uTByM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "stream, water, flow"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a person is whistling a tune"], "sample_ids": ["xERFUeZONz8", "scYRUkrFLiQ"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "a, tune, whistle"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "of the man wearing a bow tie and a suit jacket in front of a red door"], "captions_pred_audio": ["an emergency vehicle siren blares", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "an infant crying as a woman laughs"], "sample_ids": ["rqu8iB22IY", "xhmRY9yhC7c"], "start_seconds": ["5", "20"], "properties": ["sound, repeats, laugh", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a wooden clack accompanies nearby chirping birds"], 
"sample_ids": ["yPUYU6t3rwo", "yeFvk9x0wWI"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "clack, bird, chirp"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["insects buzz and a man speaks", "birds chirp in the background as a car drives by "], "question": "which entity is about birds?", "label": 0}, {"captions": ["multiple birds vocalize and wind blows", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uoGVs9yUqY4", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["multiple, vocalize, wind", "applause, audience, yells"], "captions_pred_video": ["for how to make a wooden shed door youtube", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a machine runs continuously", "vehicles pass by on a roadway"], "sample_ids": ["wdXV3Pv0jiY", "tgbONvsP47Y"], "start_seconds": ["11", "0"], "properties": ["machine, running, continuously", "pass, vehicle, roadway"], "captions_pred_video": ["footage is blurry and shaky", "footage of a fire truck entering a garage"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yeFvk9x0wWI", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["clack, bird, chirp", "airplane, boy, fly"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a small airplane 
flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "people applaud and hoot and chat quietly"], "sample_ids": ["uqFtmnhuqA8", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["a, b, c", "people, applaud, hoot"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", null], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "wind noise takes place into a microphone while rustling occurs"], "sample_ids": ["yeFvk9x0wWI", "w8uLijTqtlU"], "start_seconds": ["30", "70"], "properties": ["chirp, twitter, clatter", "wind, microphone, noise"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage is blurry and shaky"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "the wind is blowing strongly"], "question": "which noise is not made by birds", "label": 1}, {"captions": ["a child yells and another yells", "a infant makes noise and is excited"], "sample_ids": ["vMDHu7Lxcgw", "wIJK3-5y0kA"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "noise, excited, infant"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a baby cries and a woman speaks"], "question": "which entity is more excited", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a man speaks followed by another man speaking 
outside"], "sample_ids": ["y8WEcpOlT3I", "viuTg1M-dqg"], "start_seconds": ["40", "30"], "properties": ["wind, speak, buffeting", "two men, speak, follow"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 0}, {"captions": ["white noise and birds chirping", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wRBHTgrbiwg", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["noise, white, chirping", "engine, laugh, loud"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a stream of water runs briefly", "wind blows as people chatter quietly"], "sample_ids": ["x-PeY8Yb8M4", "xBxDz0CFVn0"], "start_seconds": ["300", "30"], "properties": ["stream, water, run", "wind, chatter, people"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zuua6-5goWw", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "stream, water, flow"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as crickets sing", "a man speaks, then dials a rotary telephone"], "sample_ids": ["ryFDPxgDOGc", "tK4VlLsNxak"], "start_seconds": ["570", "120"], "properties": ["a, crickets, sing", "a, dial, telephone"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and using a sewing machine"], "question": "which entity is a man speaking to a rotary telephone?", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a stream of water runs briefly"], "sample_ids": ["yeFvk9x0wWI", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["clack, bird, chirp", "stream, water, run"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["water running down a sink while a man is talking", "paper is crumpling consistently"], "sample_ids": ["vSeGhaZt-aI", "v5cSxLaHADY"], "start_seconds": ["50", "0"], 
"properties": ["water, sink, talk", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wztCSUxOf8", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["a crowd, yells, applauds", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["sa6TLVbooCc", "vlS6YMeWAPo"], "start_seconds": ["240", "40"], "properties": ["people, laugh, child", "sheep, baa, birds"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yajyRTUQk3U", "yajyRTUQk3U"], "start_seconds": ["400", "400"], "properties": ["noise, woman, speak", "a woman, something, fried"], "captions_pred_video": ["- a woman cooking in the kitchen", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking while food 
is frying in the background"], "question": "which woman is speaking over sizzling noise", "label": 0}, {"captions": ["water runs into a sink while men speak", "a man is filing a hard object"], "sample_ids": ["vzceMbklWc", "vveS8HT7Uog"], "start_seconds": ["180", "100"], "properties": ["water, sink, run", "a man, hard, object"], "captions_pred_video": [null, "footage is of a workbench with various tools on it including a hammer and a screwdriver"], "captions_pred_audio": ["water is running and a man is speaking", "a man is filing and speaking with background noise and breathing "], "question": "which object is harder to file", "label": 0}, {"captions": ["a propeller moves loudly nearby", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["ugHJF0hfYkg", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["loud, propeller, move", "rustling, ducks, quack"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a duck quacks and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sfAvvZwdLCY", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["water drains, flushes, water", "female, spraying, scream"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "loud, continuous burping"], "sample_ids": ["x5cuQjOdM3E", "y636gklDioE"], "start_seconds": ["30", "20"], "properties": ["cat, meows, young woman", "loud, continuous, burping"], "captions_pred_video": ["a 
black background with an airplane flying in the sky", "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a cat meows and a woman speaks", "a person burps loudly several times"], "question": "which is louder", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "an engine runs loudly"], "sample_ids": ["sOa7g-44Dag", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["audio, scratching, man", "loud, engine, run"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "some tunes played by whistling"], "sample_ids": ["soTOh3zYJfY", "u6BnG6YZqJ4"], "start_seconds": ["40", "0"], "properties": ["vehicle, skid, tires", "tune, play, whistling"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "pigeons vocalize and birds chirp"], "sample_ids": ["vms5XGTDVQc", "uiS58TNyUiw"], "start_seconds": ["220", "430"], "properties": ["paper, crumpled, crinkled", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "of the pigeon in the cage"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "some tunes played by whistling"], 
"sample_ids": ["xl2PIWyXaM", "u6BnG6YZqJ4"], "start_seconds": ["160", "0"], "properties": ["chirp, man, younger person", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["birds are chirping and people are talking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a machine beeps continuously", "people cheer as a vehicle engine revs"], "sample_ids": ["y682ml90jGw", "xjhAnI2q6hM"], "start_seconds": ["11", "6"], "properties": ["beeps, machine, continuously", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a beeping sound is being made ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "some tunes played by whistling"], "sample_ids": ["sxYkFKFIZD0", "u6BnG6YZqJ4"], "start_seconds": ["20", "0"], "properties": ["screech, man, door", "tune, play, whistling"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a telephone rings followed by a woman talking"], "sample_ids": ["su6FAOcOA8c", 
"tGcFnX0GHI"], "start_seconds": ["4", "0"], "properties": ["engine, idle, woman", "ring, talk, woman"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a recording", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["vZAw4apG0Es", "wSVhSdj0F0"], "start_seconds": ["30", "10"], "properties": ["background, tick, repeat", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a car horn honks and keys jangle with background noise "], "question": "which entity has a horn honk?", "label": 1}, {"captions": ["an audience gives applause", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x6iCUDmRpKQ", "xKB8O8LTs6s"], "start_seconds": ["38", "70"], "properties": ["applause, audience, give", "music, gunfire, explosion"], "captions_pred_video": ["a black background with the moon and stars in the sky", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a group of people are clapping and cheering", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["x5cuQjOdM3E", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["cat, meows, young woman", "loud, multiple, distance"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a person snoring 
loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["water flows followed by women screaming", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["w5W5Kqtc8E", "zl9Dqx-j7q4"], "start_seconds": ["100", "6"], "properties": ["water, flow, women", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a jet engine roars "], "question": "which entity is followed by laughter", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vf9xf3vMsGM", "vb1fPSDI4c"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "multiple, people, yell"], "captions_pred_video": ["of the person washing their hands under the faucet", null], "captions_pred_audio": ["a man is speaking while water is running in the background", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a vehicle engine accelerates and wind blows"], "sample_ids": ["xl2PIWyXaM", "wudZTNBtVqc"], "start_seconds": ["160", "60"], "properties": ["chirp, man, younger person", "accelerates, engine, wind"], "captions_pred_video": [null, "footage is of a parking lot with cars parked in it"], "captions_pred_audio": ["birds are chirping and people are talking", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "water flows as men speak and yell"], "sample_ids": ["wTideSjRFS0", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["food, sizzle, woman", "water, flow, men"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man in a 
red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "roadway noise occurs and a truck accelerates"], "sample_ids": ["wnpJndXuxLc", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["blows, vehicle, train", "noise, truck, accelerate"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a baby cries and a woman moans", "a stream of water runs briefly"], "sample_ids": ["smDKStoHBJo", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["a, cry, woman", "stream, water, run"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "some men converse over an engine running"], "sample_ids": ["uWAAAL4CIoc", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["a woman, chirps, animal", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman speaking and an animal chirps?", "label": 0}, {"captions": ["three men talk while wind blows and some liquid flows", "winds blows roughly as a vehicle races past"], 
"sample_ids": ["vJ7JPEFhyLA", "xjvTpk2Zpr8"], "start_seconds": ["16", "70"], "properties": ["three men, wind, flow", "wind, blows, vehicle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars and wind blows "], "question": "which entity shows a vehicle racing past?", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "an engine runs loudly"], "sample_ids": ["y2bVZ7rz-5M", "vqZuVbG6-HI"], "start_seconds": ["280", "130"], "properties": ["engine, horn, siren", "loud, engine, run"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a lawn mower is running and men are speaking "], "question": "which entity has a louder engine", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "water is sprayed across a hard surface"], "sample_ids": ["sEprKHm8Sj8", "sQwlkXjQabo"], "start_seconds": ["90", "10"], "properties": ["car, tires, slows", "water, spray, surface"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "spraying followed by silence"], "question": "which is a liquid", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "water is sprayed across a hard surface"], "sample_ids": ["tK4VlLsNxak", "sQwlkXjQabo"], "start_seconds": ["120", "10"], "properties": 
["An adult male speaks, dials, and speaks into a rotary phone", "water, spray, surface"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "wind blows as people chatter quietly"], "sample_ids": ["u5RmF3c3Aw", "xBxDz0CFVn0"], "start_seconds": ["60", "30"], "properties": ["engine, car, zoom", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "small dogs yip and bark sharply"], "sample_ids": ["wqZ135Ssz0", "v-wcQf4BDY0"], "start_seconds": ["60", "120"], "properties": ["two men, woman, birds", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a person is whistling", "waves crash against a shoreline and people speak"], "sample_ids": ["sIXTftIuUgw", "yFB25fqfU8I"], "start_seconds": ["90", "300"], "properties": ["person, whistling, person", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["a young woman speaks 
and laughs and an animal snorts", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["uEU-Hg5MTN8", "tDVADusiIoc"], "start_seconds": ["27", "60"], "properties": ["a woman, laughs, animal", "water, radio, man"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a man speaking with light rustling", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zOZleIRqZm4", "uEU-Hg5MTN8"], "start_seconds": ["80", "27"], "properties": ["light, rustling, man", "a woman, laughs, animal"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "an infant crying frantically"], "sample_ids": ["xfaoyyzw2WU", "zwOBqeFTgiU"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "cry, infant, frantically"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "of the baby crying in the car seat"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a baby cries loudly"], "question": "which is louder", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "birds chirp and objects are moved around"], "sample_ids": ["zkKdxzNC97Y", "yPUYU6t3rwo"], "start_seconds": ["27", "370"], "properties": ["loud, bang, noise", "birds chirp, objects are moved around, birds"], 
"captions_pred_video": ["footage of the door opening and closing in slow motion", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a door is opened and closed", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person speaks briefly", "people applaud and hoot and chat quietly"], "sample_ids": ["zOZleIRqZm4", "wwyfGO2J4"], "start_seconds": ["80", "90"], "properties": ["person, talk, brief", "people, applaud, hoot"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a propeller rotates loudly and intensely"], "sample_ids": ["uiItxDsDMFI", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["wood, piece, saw", "loud, intense, propeller"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a saw is being used with background noise ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a helicopter engine runs", "a car speeding up in the distance"], "sample_ids": ["t5ZbXbniOWk", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["engine, helicopter, run", "distance, car, speed"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "a woman talks while something is fried and objects are tapped"], 
"sample_ids": ["tdWhHV3X25Q", "yajyRTUQk3U"], "start_seconds": ["60", "400"], "properties": ["applause, audience, yells", "a woman, something, fried"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking while food is frying in the background"], "question": "which entity is a demonstration", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sWZzXuWYY", "vfYTJq7nU"], "start_seconds": ["420", "130"], "properties": ["male, clanks, thumps", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a machine?", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vbpKkWvfOu4", "tDVADusiIoc"], "start_seconds": ["560", "60"], "properties": ["a, man, speaks", "water, radio, man"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yRx9txMcBl0", "uEU-Hg5MTN8"], "start_seconds": ["40", 
"27"], "properties": ["accelerates, tires, squeals", "a woman, laughs, animal"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a drill runs and two people laugh"], "sample_ids": ["zdYdyF9-m8U", "tEE3MpBt1sg"], "start_seconds": ["7", "50"], "properties": ["wind, crash, shoreline", "two people, laugh, drill"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["waves crash and wind blows ", "people are laughing breathing and speaking with background noise "], "question": "which entity is stationary", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a race car approaches quickly and slows down squealing tires"], "sample_ids": ["zk-xJGQU8-4", "sEprKHm8Sj8"], "start_seconds": ["130", "90"], "properties": ["food, man, woman", "car, tires, slows"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 
2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "an airplane engine spools and people speak"], "sample_ids": ["vbZ-0lGPneg", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["a woman, a television program, a bird", "airplane, engine, spool"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a jet engine is running and people are talking"], "question": "which entity has a bird in it?", "label": 0}, {"captions": ["heavy rain splashes as it falls", "a man speaks followed by another man speaking outside"], "sample_ids": ["wP8ZKrlx3oA", "viuTg1M-dqg"], "start_seconds": ["40", "30"], "properties": ["fall, rain, splash", "two men, speak, follow"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vYkA3cfXp5Q", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["speed, idle, accelerate", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["an engine is idling", "a baby is laughing and breathing 
while a man is speaking "], "question": "which entity is more like a person", "label": 1}, {"captions": ["a person is burping while a girl speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["vdoxuJn9lTc", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["person, burp, girl", "people, applaud, hoot"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be at a party", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "small dogs yip and bark sharply"], "sample_ids": ["sapQIQUhFc", "v-wcQf4BDY0"], "start_seconds": ["280", "120"], "properties": ["water, trickles, flow", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a dark barks and whimpers"], "sample_ids": ["zF8yoL0rkbI", "sYj4hpDUZDQ"], "start_seconds": ["30", "30"], "properties": ["engine, run, someone", "barks, whimpers, dark"], "captions_pred_video": ["footage of the traffic on the street at night", "a brown and white dog standing in front of a wall with its mouth open"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a dog barks and a cat meows"], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "a child speaks in closed space"], "sample_ids": ["uC9dtII1KDI", "yW6FWLSLkx4"], "start_seconds": ["150", "40"], "properties": ["wind, gusts, distance", "child, space, speak"], "captions_pred_video": ["footage of a person riding a horse in a 
riding arena", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a woman and man are speaking", "water flows as men speak and yell"], "sample_ids": ["vbpKkWvfOu4", "vJ7JPEFhyLA"], "start_seconds": ["560", "16"], "properties": ["two people, speaking, woman, man", "water, flow, men"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a man and a woman speaking?", "label": 0}, {"captions": ["a man speaks and is typing on a keyboard", "paper folding and crinkling"], "sample_ids": ["x9JovgqUcs", "zPpG3RD8lSs"], "start_seconds": ["500", "20"], "properties": ["a, man, speaks, keyboard", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out 
of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man speaks and types on a keyboard", "the wind blows and a mouse clicks "], "question": "which is not a person", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "someone is typing on a computer keyboard"], "sample_ids": ["rqu8iB22IY", "v0x1odnXtP0"], "start_seconds": ["5", "210"], "properties": ["sound, repeats, laugh", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a man woman speak while crickets sing", "people speak as gunfire rings out"], "sample_ids": ["zTLVJCo4WEE", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["a, crickets, sing", "gunfire, ring, speak"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a train horn blows as it passes by", "an airplane engine runs"], "sample_ids": ["zVacuqSb4LI", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["horn, blows, train", "engine, airplane, runs"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a car is driving by on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a stream of water runs briefly", "a man speaks as horns blow"], "sample_ids": ["x-PeY8Yb8M4", "tHyNqRyK34A"], "start_seconds": ["300", "24"], "properties": ["stream, water, run", "a, man, speaks"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "being taken from inside a vehicle on the street at night"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking and a car is honking with background noise "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a car speeding up in the distance"], "sample_ids": ["wTjoRj1se3U", "u0TrcHhkPQ"], "start_seconds": ["390", "20"], "properties": ["airplane, engine, spool", "distance, car, speed"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a race car accelerates and revs its engine "], "question": "which object is moving faster", "label": 0}, {"captions": ["male speech with light ticking", "a woman speaks as she rubs two objects together"], "sample_ids": ["xO-Q2BlIIPU", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["male, speech, ticking", "two objects, woman, speak"], "captions_pred_video": ["a clock with a green glowing display 
showing the time 09 07 2016 12 31 2016", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a woman speaking?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zcDwZ6W7E3E", "vb1fPSDI4c"], "start_seconds": ["180", "30"], "properties": ["man, speak, motorcycles", "multiple, people, yell"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["w2bYrCVLT60", "yajyRTUQk3U"], "start_seconds": ["120", "400"], "properties": ["ducks, speak, quack", "a 
woman, something, fried"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "- a woman cooking in the kitchen"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "an airplane engine spools and people speak"], "sample_ids": ["sG7TyPnFDR0", "wTjoRj1se3U"], "start_seconds": ["180", "390"], "properties": ["beeps, machine, smoke alarm", "airplane, engine, spool"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a jet engine is running and people are talking"], "question": "which entity is a machine?", "label": 0}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "water splashes as an animal walks through"], "sample_ids": ["wP8ZKrlx3oA", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["rain, storm, thunder", "animal, water, splashes"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a heavy rain is falling on a surface", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be in a storm", "label": 0}, {"captions": ["an engine runs and wind blows", "a machine beeps continuously"], "sample_ids": ["vs65y4qmyBE", "y682ml90jGw"], "start_seconds": ["340", "11"], "properties": ["engine, run, wind", "beeps, machine, continuously"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a 
beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xKB8O8LTs6s", "uZesmtKZGSw"], "start_seconds": ["70", "250"], "properties": ["music, radio, gunshots", "men, talk, cars"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "someone whistles a tune"], "sample_ids": ["uEU-Hg5MTN8", "sIXTftIuUgw"], "start_seconds": ["27", "90"], "properties": ["a woman, laughs, animal", "someone, tune, whistle"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["people speak softly as food sizzles", "metal clacking as food and oil sizzles followed by a woman talking"], "sample_ids": ["yhQ2Lg-7qDY", "vW4x7S1VfQc"], "start_seconds": ["130", "150"], "properties": ["food, sizzle, speak", "clacking, oil, woman"], 
"captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a person cooking fish in a frying pan on a stove top"], "captions_pred_audio": ["a faucet is running and a man is speaking", "food sizzles in a frying pan"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zofjfKhqLk8", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["noise, stop, motor", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a man talks while a clock does ticktock"], "sample_ids": ["sK4u5T8hW78", "spYNpeN7rPY"], "start_seconds": ["30", "1"], "properties": ["a, car, pass", "a clock, ticktock, man"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and breathing with background noise "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a person snoring", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["t8tv5YRMJUg", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["a person, snore, loud", "People, motor, brakes"], "captions_pred_video": ["of a man getting his face licked by another man", null], "captions_pred_audio": ["a person sniffs and breathes heavily", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xvDdE3zNf8Y", "tdWhHV3X25Q"], "start_seconds": ["120", "60"], "properties": ["a, female, speaks", "applause, audience, yells"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a 
purple tie", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "people applaud and hoot and chat quietly"], "sample_ids": ["xyL9F5VrjkE", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["wind, blows, vehicle", "people, applaud, hoot"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be in a theater", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vbpKkWvfOu4", "uZesmtKZGSw"], "start_seconds": ["560", "250"], "properties": ["a, man, speaks", "men, talk, cars"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking and a car is revving with laughter in the 
background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a person is snoring while sleeping"], "sample_ids": ["yLy-WycbVVE", "vJrjSeP17yE"], "start_seconds": ["30", "40"], "properties": ["background, people, talk", "a person is sleeping, snoring, person"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tQWGZLItBXk", "uEU-Hg5MTN8"], "start_seconds": ["170", "27"], "properties": ["music, person, ding", "a woman, laughs, animal"], "captions_pred_video": ["worms revolution screenshots", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a baby is crying"], "question": "which entity has a person speaking?", "label": 0}, {"captions": ["a woman speaks as frying food sizzles", "a man speaks followed by another man speaking outside"], "sample_ids": ["wTideSjRFS0", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["food, sizzle, woman", "two men, speak, follow"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell 
dings", "material crumbles into a microphone"], "sample_ids": ["t69a8aRKhmc", "vofpvUo6NAw"], "start_seconds": ["30", "220"], "properties": ["a, b, c", "material, crumbles, microphone"], "captions_pred_video": ["footage is blurry and out of focus", "person wrapping a toy car in a plastic bag"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "paper is being crumpled and crinkled"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "dishes cling together then a man begins to speak"], "sample_ids": ["w8uLijTqtlU", "sQGXqGcwOTc"], "start_seconds": ["70", "3"], "properties": ["wind, microphone, noise", "cling, speak, dishes"], "captions_pred_video": ["footage is blurry and shaky", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["the wind is blowing strongly", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vXlk0lIQBFo", "xKB8O8LTs6s"], "start_seconds": ["470", "70"], "properties": ["wind, speak, vocalize", "music, gunfire, explosion"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "dishes cling together then a man begins to speak"], "sample_ids": ["ylpYOorfH4o", "sQGXqGcwOTc"], "start_seconds": ["410", "3"], "properties": ["engine, running, wind", "cling, speak, dishes"], "captions_pred_video": ["for youtube how to replace 
the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and an engine is revving", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["plastic is tapped on while someone speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wvKpEYswXO0", "zl9Dqx-j7q4"], "start_seconds": ["150", "6"], "properties": ["plastic, tap, speak", "engine, laugh, loud"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a door slams shut roughly"], "sample_ids": ["w5W5Kqtc8E", "zkKdxzNC97Y"], "start_seconds": ["100", "27"], "properties": ["wind, engine, scream", "a door, slams, shut"], "captions_pred_video": [null, "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a door is opened and closed"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "birds chirp and objects are moved around"], "sample_ids": 
["wztCSUxOf8", "yPUYU6t3rwo"], "start_seconds": ["130", "370"], "properties": ["a crowd, yells, applauds", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 0}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "someone is typing on a computer keyboard"], "sample_ids": ["sG7TyPnFDR0", "v0x1odnXtP0"], "start_seconds": ["180", "210"], "properties": ["beeps, machine, smoke alarm", "keyboard, type, computer"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a person is typing on a keyboard"], "question": "which entity is typing on a computer keyboard?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "people applaud and hoot and chat quietly"], "sample_ids": ["sAam2NqGhLY", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["snoring, breathing, child", "people, applaud, hoot"], "captions_pred_video": ["of a little girl sleeping on a couch", null], "captions_pred_audio": ["a person is snoring", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "some men converse over an engine running"], "sample_ids": ["u5RmF3c3Aw", "sCiy7QS1U"], "start_seconds": ["60", "300"], "properties": ["engine, car, zoom", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a car zooming by?", 
"label": 0}, {"captions": ["a man yells and speaks as water splashes", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vimzuGQvdcU", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["a, man, yells", "wind, blow, vehicle"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a man yelling?", "label": 0}, {"captions": ["continuous chugging with birds chirping in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["xM4joTqDVp4", "uYT5gxnyMWM"], "start_seconds": ["160", "50"], "properties": ["background, chirp, birds", "a, scream, girl"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream in the background", "label": 1}, {"captions": ["water pouring and bubbling", "an infant crying as a woman laughs"], "sample_ids": ["uyRfq-jKPpo", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["water, bubbles, pouring", "a, laugh, infant"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["water is running from a faucet", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sWZzXuWYY", "wz7N8YRy74I"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a car speeding up in the distance"], "sample_ids": ["wSVhSdj0F0", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["beep, clang, footsteps", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a clock ticktocks"], "sample_ids": ["voJh2gJxXhA", "v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["music, frog, croak", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a clock is ticking loudly"], "question": 
"which entity is silent", "label": 1}, {"captions": ["people speak then an engine runs", "a jet engine spools up and takes off"], "sample_ids": ["uMTTDZ2mb4", "vBslzh7saPw"], "start_seconds": ["30", "90"], "properties": ["engine, run, people", "engine, spools, takes"], "captions_pred_video": [null, "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a jet engine roars and accelerates "], "question": "which entity is a video of an engine running?", "label": 0}, {"captions": ["a fly buzzes around loudly as birds chirp", "birds vocalize and chirp continuously"], "sample_ids": ["uJV8NDaHqqk", "w1mlz3Pe4fU"], "start_seconds": ["100", "300"], "properties": ["loud, fly, chirp", "vocalize, chirp, continuously"], "captions_pred_video": ["a bee hive in a wooden box", "of a bird in a cage"], "captions_pred_audio": ["a swarm of bees buzzing around", "birds are chirping and singing"], "question": "which entity is quieter", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "frogs croak and vocalize"], "sample_ids": ["weDbePuc-Xc", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["cartoon character, music, vocalize", "croak, vocalize, frog"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a close up of a frog in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["xl2PIWyXaM", "wwyfGO2J4"], "start_seconds": ["160", "90"], "properties": ["chirp, man, younger person", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people 
are talking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["an insect buzzes around continuously", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["v25l1jef3JY", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["buzzes, continuously, insect", "airplane, boy, fly"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a man speaks followed by another man speaking outside"], "sample_ids": ["vf9xf3vMsGM", "viuTg1M-dqg"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "two men, speak, follow"], "captions_pred_video": ["of the person washing their hands under the faucet", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["food is frying while a woman speaks", "a car accelerates and wind blows"], "sample_ids": ["yhQ2Lg-7qDY", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["food, woman, speak", "accelerates, wind, blows"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["women speak as water runs briefly, children 
call out, and a man speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uRExseg-0XI", "w34HjHr6gAY"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "beeps, hit, woman"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a clock ticktocks"], "sample_ids": ["y2bVZ7rz-5M", "v-g-j2uTByM"], "start_seconds": ["280", "30"], "properties": ["engine, horn, siren", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a man talks followed by a woman shouting"], "sample_ids": ["sEprKHm8Sj8", "s3cTDAj31g"], "start_seconds": ["90", "80"], "properties": ["car, tires, slows", "man, talk, woman"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 
2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a man speaks while water trickles and flows"], "sample_ids": ["x6ijhqRY38s", "sapQIQUhFc"], "start_seconds": ["250", "280"], "properties": ["bowl, silverware, man", "water, trickles, flow"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is about water flowing?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zk-xJGQU8-4", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["food, man, woman", "a woman, laughs, animal"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "people applaud and hoot and chat quietly"], "sample_ids": ["y2bVZ7rz-5M", "wwyfGO2J4"], "start_seconds": ["280", "90"], "properties": ["motor noise, horn, siren", "people, applaud, hoot"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "people are clapping and speaking with background noise 
"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a machine runs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vD6lYD1l0BY", "yajyRTUQk3U"], "start_seconds": ["330", "400"], "properties": ["a, machine, run", "a woman, something, fried"], "captions_pred_video": ["game controller being held in the hands of the person", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["some clanking with distant murmuring", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uMTTDZ2mb4", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["clanking, murmuring, distant", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a duck quacks continuously", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vh30P49Po6s", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["quacks, continuously, duck", "applause, audience, yells"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["people speak softly as food sizzles", "water flows as men speak and yell"], "sample_ids": ["yhQ2Lg-7qDY", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["food, 
sizzle, speak", "water, flow, men"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["food is frying then a woman speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["ukxt9I7eMMg", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["food, woman, speak", "applause, audience, yells"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a propeller rotates loudly and intensely"], "sample_ids": ["vXlk0lIQBFo", "ugHJF0hfYkg"], "start_seconds": ["470", "10"], "properties": ["wind, talk, vocalize", "loud, intense, propeller"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["xO-Q2BlIIPU", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["two men, exclamation, speak", "a, scream, girl"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a 
man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "an airplane engine spools and people speak"], "sample_ids": ["s4Uz1Ffgo04", "wTjoRj1se3U"], "start_seconds": ["100", "390"], "properties": ["roars, background, people speaking", "airplane, engine, spool"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a jet engine is running and people are talking"], "question": "which entity is quieter", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["xjvTpk2Zpr8", "siJFXfGWgDk"], "start_seconds": ["70", "50"], "properties": ["engine, run, wind", "man, woman, vehicle"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking and birds are chirping in the background "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "paper folding and crinkling"], "sample_ids": ["xZepNM9qcRA", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["background, motor, run", "paper, fold, crinkle"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper 
youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "the wind blows and a mouse clicks "], "question": "which entity is more likely to be used in a classroom", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "someone is typing on a computer keyboard"], "sample_ids": ["x5cuQjOdM3E", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["cat, talk, meow", "keyboard, type, computer"], "captions_pred_video": ["a black background with an airplane flying in the sky", "how to make money on youtube in spanish"], "captions_pred_audio": ["a cat meows and a woman speaks", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wsHBIgzs9Fs", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["horn, continuous, buzzing", "female, spraying, scream"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a woman is speaking and a baby is crying"], "question": "which entity is more abrasive", "label": 1}, 
{"captions": ["a sleeping person emits a gravely snore", "an engine runs loudly"], "sample_ids": ["w2JXXIAdUdg", "vqZuVbG6-HI"], "start_seconds": ["10", "130"], "properties": ["emits, sleeping, person", "loud, engine, run"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["siJFXfGWgDk", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["man, woman, vehicle", "people, applaud, hoot"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "an airplane engine runs"], "sample_ids": ["wRV8yMk886E", "yVPZ2MNWpms"], "start_seconds": ["0", "0"], "properties": ["liquid, spray, nozzle", "engine, airplane, runs"], "captions_pred_video": ["two cars are parked in a parking lot at night", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["a male speaks and another male speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["viuTg1M-dqg", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["two males, speaking, male", "people, applaud, hoot"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing 
sounds ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "music plays followed by gunshots and then an explosion"], "sample_ids": ["vf44CgrjT0A", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["loud, long, person", "music, gunshots, explosion"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a loud burp", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is louder", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sAam2NqGhLY", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["snoring, breathing, child", "airplane, boy, fly"], "captions_pred_video": ["of a little girl sleeping on a couch", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person is snoring", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a man speaks followed by another man speaking outside"], "sample_ids": ["xjvTpk2Zpr8", "viuTg1M-dqg"], "start_seconds": ["70", "30"], "properties": ["engine, run, wind", "two men, speak, follow"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single person speaking?", "label": 0}, {"captions": ["some people speak", "a man speaks as a motor runs in the background"], "sample_ids": 
["vbZ-0lGPneg", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "background, motor, run"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vbZ-0lGPneg", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["a woman, a television program, a bird", "gun, shoot, water"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["w6RTHR6AeAg", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["call, owl, screech", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["u5RmF3c3Aw", "xfaoyyzw2WU"], "start_seconds": ["60", "180"], "properties": ["engine, car, zoom", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an 
airplane on the tarmac at an airport"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows strongly", "a man speaks followed by another man speaking outside"], "sample_ids": ["w8uLijTqtlU", "viuTg1M-dqg"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "two men, speak, follow"], "captions_pred_video": ["footage is blurry and shaky", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sDSppXIlJrs", "sLUnaPT5gM8"], "start_seconds": ["27", "0"], "properties": ["microphone, water, wind", "loud, laughter, intermittent"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a scream", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "an adult man speaks over glass clinking"], "sample_ids": ["uYT5gxnyMWM", "u6jIvCtKarQ"], "start_seconds": ["50", "70"], "properties": ["a, scream, girl", "a, man, speaks"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a person using a blender on a stove top"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and dishes are being moved with background noise "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks 
and is typing on a keyboard", "an airplane engine runs"], "sample_ids": ["x9JovgqUcs", "yVPZ2MNWpms"], "start_seconds": ["500", "0"], "properties": ["a, man, speaks, keyboard", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a car is driving by on the road "], "question": "which is a moving object", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xyL9F5VrjkE", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["wind, blows, vehicle", "men, talk, cars"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wyllXV6PjKo", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["a kid, talk, cry", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman 
talks while a baby cries and a man whispers", "a grown man speaks and water bubbles and runs"], "sample_ids": ["smDKStoHBJo", "vSeGhaZt-aI"], "start_seconds": ["0", "50"], "properties": ["a, talk, baby, cry", "water, bubbles, run"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and pouring liquid with background noise "], "question": "which entity has a baby?", "label": 0}, {"captions": ["a frog vocalizes while birds chirp", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vMf1dLD6Sng", "y8WEcpOlT3I"], "start_seconds": ["6", "40"], "properties": ["frog, bird, vocalize", "harsh, wind, blows"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a frog croaks loudly", "a man is speaking with wind noise in the background "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a duck quacks continuously"], "sample_ids": ["t97k0cejSQE", "vh30P49Po6s"], "start_seconds": ["250", "30"], "properties": ["bird, chirp, insect", "quacks, continuously, duck"], "captions_pred_video": ["a bee on a purple thistle flower", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 0}, {"captions": ["a baby laugh at a sputter", "a man speaks as a car is passing by"], "sample_ids": ["sLUnaPT5gM8", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["laugh, sputter, baby", "a, car, pass"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks while vehicles pass by", "a telephone rings followed by a woman talking"], "sample_ids": ["sK4u5T8hW78", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["a, man, talk", "ring, talk, woman"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["tOj4tdLRaA", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["woman, laugh, baby", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "an 
airplane engine spools and people speak"], "sample_ids": ["vzceMbklWc", "wTjoRj1se3U"], "start_seconds": ["180", "390"], "properties": ["water, faucet, sink", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["water is running and a man is speaking", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a frog croaks as other frogs croak in the background"], "sample_ids": ["tw76HGONaKg", "yswmmRZFItk"], "start_seconds": ["570", "0"], "properties": ["A, game, keyboard", "background, frog, croak"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a close up of a frog in the water"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a stream runs then someone speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wbHTKEJZyhc", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["stream, run, someone", 
"loud, multiple, distance"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaking with light rustling", "people cheer as a vehicle engine revs"], "sample_ids": ["zOZleIRqZm4", "xjhAnI2q6hM"], "start_seconds": ["80", "6"], "properties": ["light, rustling, man", "engine revs, vehicle, people"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["people clap and speak in the distance", 
"multiple motorcycles pass by as a man speaks"], "sample_ids": ["wwyfGO2J4", "zcDwZ6W7E3E"], "start_seconds": ["90", "180"], "properties": ["clap, distance, speak", "man, speak, motorcycles"], "captions_pred_video": [null, "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["someone whistles a tune", "people speak softly as food sizzles"], "sample_ids": ["sIXTftIuUgw", "yhQ2Lg-7qDY"], "start_seconds": ["90", "130"], "properties": ["someone, tune, whistle", "food, sizzle, speak"], "captions_pred_video": [null, "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a person whistling a song", "a faucet is running and a man is speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "waves crash against a shoreline and people speak"], "sample_ids": ["uPDn2BFTHk", "yFB25fqfU8I"], "start_seconds": ["140", "300"], "properties": ["woman, laughs, speaks", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "an insect buzzes around continuously"], "sample_ids": ["yZmhM1HcsyE", "v25l1jef3JY"], "start_seconds": ["4", "0"], "properties": ["engine, roar, water", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a 
motorboat speeds through water with wind noise in the background ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "dishes cling together then a man begins to speak"], "sample_ids": ["zkKdxzNC97Y", "sQGXqGcwOTc"], "start_seconds": ["27", "3"], "properties": ["hard, surface, door", "cling, speak, dishes"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a door is opened and closed", "mechanisms are operating and water is splashing "], "question": "which entity is about a door slamming shut?", "label": 0}, {"captions": ["loud intermittent buzzing with intermittent laughter", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sLUnaPT5gM8", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["loud, laughter, intermittent", "three men, wind, flow"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vBHyYJ8pL0", "wDVMhEdTiVw"], "start_seconds": ["2", "30"], "properties": ["noise, door, opening", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks while playing a video 
game on a keyboard", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["tw76HGONaKg", "ziUT9IFTkjg"], "start_seconds": ["570", "10"], "properties": ["A, game, keyboard", "background, birds, rustling"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "birds are chirping and a chime is ringing "], "question": "which entity is a video?", "label": 0}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a car accelerates and wind blows"], "sample_ids": ["xNMovAf3o50", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["rain, thunder, music", "accelerates, wind, blows"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", null], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a small engine spits as it runs", "people applaud and hoot and chat quietly"], "sample_ids": ["sZvwOuuPGP0", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["spits, 
engine, runs", "people, applaud, hoot"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", null], "captions_pred_audio": ["a medium engine is running ", "people are clapping and speaking with background noise "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a person is whistling a tune", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["scYRUkrFLiQ", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["a, tune, whistle", "harsh, wind, blows"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "some tunes played by whistling"], "sample_ids": ["zcDwZ6W7E3E", "u6BnG6YZqJ4"], "start_seconds": ["180", "0"], "properties": ["a, man, speak", "tune, play, whistling"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a person whistles a meandering tune", "a man sprays as a scraping occurs in the background"], "sample_ids": ["uFoga8sHpiw", "sOa7g-44Dag"], "start_seconds": ["90", "30"], "properties": ["person, tune, whistle", "background, man, spray"], "captions_pred_video": ["footage of a bird in a cage", "footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic"], "captions_pred_audio": ["a person whistles a song", "a man is speaking and rubbing his hands together "], 
"question": "which entity is more active", "label": 1}, {"captions": ["a helicopter engine runs", "pigeons vocalize and birds chirp"], "sample_ids": ["t5ZbXbniOWk", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["engine, helicopter, run", "vocalize, bird, chirp"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "of the pigeon in the cage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a man is filing a hard object", "a man speaks as a motor runs in the background"], "sample_ids": ["vveS8HT7Uog", "xZepNM9qcRA"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "background, motor, run"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which object is harder to file", "label": 0}, {"captions": ["continuous chugging with birds chirping in the background", "a frog croaks as other frogs croak in the background"], "sample_ids": ["xM4joTqDVp4", "yswmmRZFItk"], "start_seconds": ["160", "0"], "properties": ["background, chirp, birds", "background, frog, croak"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tDVADusiIoc", "vb1fPSDI4c"], 
"start_seconds": ["60", "30"], "properties": ["water, radio, man", "multiple, people, yell"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an emergency siren wails as it passes", "people talk quietly in the distance, followed by a police car siren wailing"], "sample_ids": ["vGj1XLJvNrw", "wy1eKjR7KC0"], "start_seconds": ["0", "30"], "properties": ["wails, wails, pass", "people, talk, distance"], "captions_pred_video": ["footage of a police car driving down a city street", "two police officers riding motorcycles down the street"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a man is speaking and a siren is going off"], "question": "which entity is wails as it passes?", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wyllXV6PjKo", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a kid, talk, cry", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a baby is crying"], "question": "which entity has a female spraying and a female screaming?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w2M4i1mklOA", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["alarm, gears, turn", "three men, wind, flow"], "captions_pred_video": ["footage of an antique clock", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking while the wind 
is blowing and water is splashing"], "question": "which entity shows a clock?", "label": 0}, {"captions": ["people speak as gunfire rings out", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wqTCwqVRDlk", "w5W5Kqtc8E"], "start_seconds": ["80", "100"], "properties": ["gunfire, ring, speak", "wind, blow, vehicle"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", null], "captions_pred_audio": ["a man is speaking and a gun is fired", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "multiple people speak and children yell while water gurgles"], "sample_ids": ["v5P-ThUCINM", "vb1fPSDI4c"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and birds are chirping", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "water is sprayed across a hard surface"], "sample_ids": ["sa6TLVbooCc", "sQwlkXjQabo"], "start_seconds": ["240", "10"], "properties": ["people, laugh, child", "water, spray, surface"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a beep repeats multiple times", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y682ml90jGw", "sSMl2vc3ek"], "start_seconds": ["11", "20"], "properties": ["beep, repeat, multiple", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is 
being made ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["food is frying and sizzles", "a toilet flushes and water drains"], "sample_ids": ["zNRChLjqcU", "sfAvvZwdLCY"], "start_seconds": ["220", "20"], "properties": ["food is frying, sizzles, food", "water drains, flushes, water"], "captions_pred_video": [null, "footage of the toilet in the bathroom"], "captions_pred_audio": ["water is running from a faucet into a sink", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a man talks while vehicles pass by", "water flows as men speak and yell"], "sample_ids": ["sK4u5T8hW78", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, man, talk", "water, flow, men"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a man talking while vehicles pass by?", "label": 0}, {"captions": ["multiple ducks quack continuously", "an infant crying frantically"], "sample_ids": ["wfHeoPDLMaM", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "cry, infant, frantically"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the 
barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "of the baby crying in the car seat"], "captions_pred_audio": ["ducks are quacking", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vf9xf3vMsGM", "su6FAOcOA8c"], "start_seconds": ["540", "4"], "properties": ["A man speaks while turning a water faucet on.", "engine, idle, woman"], "captions_pred_video": ["of the person washing their hands under the faucet", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a woman is speaking and a subway train is moving "], "question": "which entity is a man speaking while turning a water faucet on?", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "a machine beeps continuously"], "sample_ids": ["s3cTDAj31g", "y682ml90jGw"], "start_seconds": ["80", "11"], "properties": ["man, talk, woman", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["water flows as men speak and yell", "rustling occurs, ducks quack and water splashes, followed by an adult 
female and adult male speaking and duck calls being blown"], "sample_ids": ["vJ7JPEFhyLA", "vfYTJq7nU"], "start_seconds": ["16", "130"], "properties": ["water, flow, men", "rustling, ducks, quack"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["wind blows as people chatter quietly", "people applaud and hoot and chat quietly"], "sample_ids": ["xBxDz0CFVn0", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["wind, chatter, people", "people, applaud, hoot"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a person is whistling", "plastic is tapped on while someone speaks"], "sample_ids": ["sIXTftIuUgw", "wvKpEYswXO0"], "start_seconds": ["90", "150"], "properties": ["person, whistling, person", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a sleeping person emits a gravely snore"], "sample_ids": ["ylpYOorfH4o", "w2JXXIAdUdg"], "start_seconds": ["410", "10"], "properties": ["motor, run, steady", "emits, sleeping, person"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace 
the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a close up shot of a person's mouth with a toothbrush in it"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a person snoring and a dog whimpering"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a person screams glaringly", "a man speaks as a car is passing by"], "sample_ids": ["xC8kbrKJmco", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["glaringly, screams, person", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more passive", "label": 1}, {"captions": ["water flows as men speak and yell", "a man speaks as a car is passing by"], "sample_ids": ["vJ7JPEFhyLA", "sK4u5T8hW78"], "start_seconds": ["16", "30"], "properties": ["water, flow, men", "a, car, pass"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more calm", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "birds chirp and objects are moved around"], "sample_ids": ["w9lpbUn0hPc", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["male, wind, rustling", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a clock ticktocks"], "sample_ids": ["xBxDz0CFVn0", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["stream, water, flow", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is blurry and out of focus", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "a cat meows as a young woman speaks"], "sample_ids": ["zj2R0XoFr5k", "x5cuQjOdM3E"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, overhead", "cat, meows, young woman"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a black 
background with an airplane flying in the sky"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a cat meows and a woman speaks"], "question": "which entity is a pet", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tEE3MpBt1sg", "zFjIWfSD-4"], "start_seconds": ["50", "410"], "properties": ["drill, something, laugh", "People, motor, brakes"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a drill?", "label": 0}, {"captions": ["children cheer as a man speaks then an audience screams", "paper folding and crinkling"], "sample_ids": ["vJvryTwuAV8", "zPpG3RD8lSs"], "start_seconds": ["16", "20"], "properties": ["audience, cheer, man", "paper, fold, crinkle"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], 
"captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling paper?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["ylpYOorfH4o", "xKB8O8LTs6s"], "start_seconds": ["410", "70"], "properties": ["engine, running, wind", "music, gunfire, explosion"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and an engine is revving", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "pigeons vocalize and birds chirp"], "sample_ids": ["vz8868znkVQ", "uiS58TNyUiw"], "start_seconds": ["60", "430"], "properties": ["audio, click, kid speaking", "vocalize, bird, chirp"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "of the pigeon in the cage"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["multiple birds vocalize and wind 
blows", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uoGVs9yUqY4", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["multiple, vocalize, wind", "a woman, laughs, animal"], "captions_pred_video": ["for how to make a wooden shed door youtube", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vJ7JPEFhyLA", "xKB8O8LTs6s"], "start_seconds": ["16", "70"], "properties": ["three men, wind, flow", "music, gunfire, explosion"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, {"captions": ["a child speaks in closed space", "a infant makes noise and is excited"], "sample_ids": ["yW6FWLSLkx4", "wIJK3-5y0kA"], "start_seconds": ["40", "30"], "properties": ["child, space, speak", "noise, excited, infant"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a baby cries and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w0xsN8X18Y", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["music, surface, rain", "gun, 
shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause water to slosh", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a man speaks as a motor runs in the background"], "sample_ids": ["vYkA3cfXp5Q", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["engine, accelerate, idle", "background, motor, run"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an engine is idling", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a vehicle engine?", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["sa6TLVbooCc", "xjhAnI2q6hM"], "start_seconds": ["240", "6"], "properties": ["people, laugh, child", "engine revs, vehicle, people"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an airplane engine spools and people speak", "people speak as gunfire rings out"], "sample_ids": ["wTjoRj1se3U", "wqTCwqVRDlk"], "start_seconds": ["390", "80"], "properties": ["airplane, engine, spool", "gunfire, ring, speak"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a jet engine is running and people are talking", 
"a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a stream of water runs briefly"], "sample_ids": ["zPX9o1uDiI", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["engine, horn, run", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 0}, {"captions": ["a stream runs then someone speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wbHTKEJZyhc", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["stream, run, someone", "applause, audience, yells"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with 
trees and", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a clock ticktocks"], "sample_ids": ["xvDdE3zNf8Y", "v-g-j2uTByM"], "start_seconds": ["120", "30"], "properties": ["A, crumple, paper", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman speaks and crumples paper", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as crickets sing", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["ryFDPxgDOGc", "tdWhHV3X25Q"], "start_seconds": ["570", "60"], "properties": ["a, crickets, sing", "applause, audience, yells"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["someone snores nearby", "paper folding and crinkling"], "sample_ids": ["spJCm8tD9Zo", "zPpG3RD8lSs"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "paper, fold, crinkle"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a 
piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a person is snoring loudly", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "wind blowing followed by a zoom"], "sample_ids": ["yVumC9TGknc", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["humming, clock, birds", "wind, blow, zoom"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a series of beeps and chirps", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "an infant crying as a woman laughs"], "sample_ids": ["tw76HGONaKg", "xhmRY9yhC7c"], "start_seconds": ["570", "20"], "properties": ["music, click, man", "a, laugh, infant"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of 
zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water drains", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sfAvvZwdLCY", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["water drains, flushes, water", "female, spraying, scream"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["smGI3C1NZc", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["water, drain, toilet", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and laughing?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a man speaks as a car is passing by"], "sample_ids": ["su6FAOcOA8c", "sK4u5T8hW78"], "start_seconds": ["4", "30"], "properties": ["engine, run, woman", "a, car, pass"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a woman making an announcement?", "label": 0}, {"captions": ["an engine sputters followed by a car zooming by", "females talk and laugh over gusting wind"], "sample_ids": ["u5RmF3c3Aw", "un9VQlzgZM"], "start_seconds": ["60", "5"], "properties": ["engine, car, zoom", "females, talk, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vr8ZXjEBhMQ", "vlS6YMeWAPo"], "start_seconds": ["150", "40"], "properties": ["wind, blow, zoom", "sheep, baa, birds"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a goat bleats and birds chirp"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a person is whistling a tune", "water splashes as an animal walks through"], "sample_ids": ["scYRUkrFLiQ", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["a, tune, whistle", "animal, water, splashes"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "footage of a group of people riding horses 
through a river"], "captions_pred_audio": ["a person whistling a song", "water splashes and gurgles as people speak"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wAAkbZToh8", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man burps and a woman speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a train horn blows as it passes by", "a vehicle accelerates and squeals tires"], "sample_ids": ["zVacuqSb4LI", "yRx9txMcBl0"], "start_seconds": ["30", "40"], "properties": ["horn, blows, train", "accelerates, tires, squeals"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wsHBIgzs9Fs", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["horn, continuous, buzzing", "rooster, crow, background, men"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird?", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "some tunes played by whistling"], "sample_ids": ["uKCSGgof8gI", "u6BnG6YZqJ4"], "start_seconds": ["12", "0"], "properties": ["chirps, distance, signal", "tune, play, whistling"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "water pouring and bubbling"], "sample_ids": ["yI-KvObbDoY", "uyRfq-jKPpo"], "start_seconds": ["260", "50"], "properties": ["sound, smack, wind", "water, bubbles, pouring"], "captions_pred_video": ["of a man 
hanging clothes on a clothesline in the backyard", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as horns blow", "winds blows roughly as a vehicle races past"], "sample_ids": ["tHyNqRyK34A", "xjvTpk2Zpr8"], "start_seconds": ["24", "70"], "properties": ["a, man, speaks", "wind, blows, vehicle"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yYEVLuqEytU", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["animal, pig, background", "engine, laugh, loud"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a man driving a car in the dark"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a 
jet engine roars "], "question": "which entity is followed by a man laughing?", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["tPJvjq9QePY", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["animal, bleat, moo", "background, motor, run"], "captions_pred_video": ["a dog and a sheep in a barn", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a baby cries and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a man speaks as a car is passing by"], "sample_ids": ["vdoxuJn9lTc", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "a, car, pass"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "a guy speaks with birds chirping in the background"], "sample_ids": ["uWAAAL4CIoc", "v5P-ThUCINM"], "start_seconds": ["0", "400"], "properties": ["a woman, chirps, animal", "background, chirp, bird"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man 
is speaking and birds are chirping"], "question": "which entity has a bird chirping in the background?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "waves crash against a shoreline and people speak"], "sample_ids": ["v0x1odnXtP0", "yFB25fqfU8I"], "start_seconds": ["210", "300"], "properties": ["keyboard, type, computer", "wave, crash, shoreline"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is a natural phenomenon", "label": 1}, {"captions": ["a male speaks and another male speaks", "a man speaks as a machine runs"], "sample_ids": ["viuTg1M-dqg", "vD6lYD1l0BY"], "start_seconds": ["30", "330"], "properties": ["two males, speaking, male", "a, machine, run"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "game controller being held in the hands of the person"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and dishes are being washed "], "question": "which entity has a machine running", "label": 1}, {"captions": ["a consistent ticking pattern", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sCeWURVHfOM", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["ticking, pattern, clock", "music, gunfire, explosion"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["ticking of a clock", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "some men converse over an engine running"], "sample_ids": ["w2M4i1mklOA", "sCiy7QS1U"], "start_seconds": 
["30", "300"], "properties": ["alarm, gears, turn", "men, converse, engine"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a clock?", "label": 0}, {"captions": ["a telephone rings followed by a woman talking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tGcFnX0GHI", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["ring, talk, woman", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster crow?", "label": 1}, {"captions": ["people speak as gunfire rings out", "an infant crying frantically"], "sample_ids": ["wqTCwqVRDlk", "zwOBqeFTgiU"], "start_seconds": ["80", "30"], "properties": ["gunfire, ring, speak", "cry, infant, frantically"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "a car speeding up in the distance"], "sample_ids": ["uWAAAL4CIoc", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["a woman, chirps, animal", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a woman speaks as she rubs two objects together"], "sample_ids": ["vddP56-ogds", 
"vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["liquid, laughs, man", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["an animal quacks rapidly", "a loud engine muffles a man as he speaks"], "sample_ids": ["vh30P49Po6s", "xyx6eNVEYRY"], "start_seconds": ["30", "380"], "properties": ["animal, quacks, rapidly", "loud, engine, muffles"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a duck is quacking loudly", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a vehicle engine runs and someone speaks"], "sample_ids": ["w5W5Kqtc8E", "zF8yoL0rkbI"], "start_seconds": ["100", "30"], "properties": 
["wind, blow, vehicle", "engine, run, someone"], "captions_pred_video": [null, "footage of the traffic on the street at night"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "the wind is blowing hard and water is splashing"], "question": "which entity has a vehicle engine running and someone speaking?", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "an infant crying as a woman laughs"], "sample_ids": ["wyllXV6PjKo", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["a baby, a woman, a man", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman speaks and a baby cries", "a baby cries and a woman speaks"], "question": "which entity is a video of a baby crying?", "label": 0}, {"captions": ["a toilet flushes and water drains", "wind blowing followed by a zoom"], "sample_ids": ["sfAvvZwdLCY", "vr8ZXjEBhMQ"], "start_seconds": ["20", "150"], "properties": ["water drains, flushes, water", "wind, blow, zoom"], "captions_pred_video": ["footage of the toilet in the bathroom", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a toilet is flushed", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["birds chirp and wind blows", "vehicles pass by on a roadway"], "sample_ids": ["sxIvBMSavMQ", "tgbONvsP47Y"], "start_seconds": ["210", "0"], "properties": ["birds, chirp, wind", "pass, vehicle, roadway"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive 
beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a person sniffs and sneezes", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uRlbY6aoBU", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["sneezes, person, sniffs", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a crowd of people are talking and laughing"], "question": "which entity is more active", "label": 1}, {"captions": ["water splashes and a door squeaks", "a telephone rings followed by a woman talking"], "sample_ids": ["sdXV-ylviw", "tGcFnX0GHI"], "start_seconds": ["190", "0"], "properties": ["sound, splash, door", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and taps with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a frog croaks as other frogs croak in the background"], "sample_ids": ["ugHJF0hfYkg", "yswmmRZFItk"], "start_seconds": ["10", "0"], "properties": ["loud, intense, propeller", "background, frog, croak"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a close up of a frog in the water"], "captions_pred_audio": ["a helicopter is flying overhead ", "a frog is croaking"], "question": "which is quieter", "label": 1}, {"captions": ["water running down a sink while a man is talking", "two men and a woman talk while 
wind blows and birds tweet"], "sample_ids": ["vSeGhaZt-aI", "wqZ135Ssz0"], "start_seconds": ["50", "60"], "properties": ["water, sink, talk", "two men, woman, birds"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a stream of water runs briefly"], "sample_ids": ["xl2PIWyXaM", "x-PeY8Yb8M4"], "start_seconds": ["160", "300"], "properties": ["chirp, man, younger person", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["birds are chirping and people are talking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a machine runs continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wdXV3Pv0jiY", "yDoT73BWsdA"], "start_seconds": ["11", "10"], "properties": ["machine, running, continuously", "engine, revs, vehicle"], "captions_pred_video": ["footage is blurry and shaky", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a race car accelerates and revs its engine "], "question": "which machine is running continuously", "label": 0}, {"captions": ["a man speaks as a machine runs", "people speak as gunfire rings out"], "sample_ids": ["vD6lYD1l0BY", "wqTCwqVRDlk"], "start_seconds": ["330", "80"], "properties": ["a, machine, run", "gunfire, ring, speak"], "captions_pred_video": ["game controller being held in the hands of the person", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking and a 
gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vddP56-ogds", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["liquid, laughs, man", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a weapon fires multiple times", "people applaud and hoot and chat quietly"], "sample_ids": ["sMC07Ucy7kg", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["weapon, fire, multiple", "people, applaud, hoot"], "captions_pred_video": ["footage is from a car's point of view", null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 0}, {"captions": ["a cat meows and children speak", "people cheer as a vehicle engine revs"], "sample_ids": ["x5cuQjOdM3E", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["cat, speak, children", "engine revs, vehicle, people"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a cat meows and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "waves crash against a shoreline and people speak"], "sample_ids": ["w2M4i1mklOA", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["loud, chime, bell", 
"wave, crash, shoreline"], "captions_pred_video": ["footage of an antique clock", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone is burping continuously", "a duck quacks loudly and continuously"], "sample_ids": ["y636gklDioE", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "loud, continuous, quacks"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person burps loudly several times", "a duck is quacking loudly"], "question": "which entity is making a noise", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["s6DESzUTGjY", "uYT5gxnyMWM"], "start_seconds": ["16", "50"], "properties": ["wind, laugh, woman", "a, scream, girl"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of a person spraying paint on the ceiling"], 
"captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "two men speak as a buffeting wind blows"], "sample_ids": ["s6DESzUTGjY", "y8WEcpOlT3I"], "start_seconds": ["16", "40"], "properties": ["wind, laugh, woman", "wind, speak, buffeting"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 0}, {"captions": ["some people speak", "a woman speaks happily and an animal chirps"], "sample_ids": ["vbZ-0lGPneg", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "a woman, chirps, animal"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking and a dog is barking "], "question": "which entity has a more 
chirpy animal", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "people applaud and hoot and chat quietly"], "sample_ids": ["tjmoSi330GM", "wwyfGO2J4"], "start_seconds": ["23", "90"], "properties": ["speed, water, boat", "people, applaud, hoot"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", null], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "people are clapping and speaking with background noise "], "question": "which entity is moving at a slower speed", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "paper folding and crinkling"], "sample_ids": ["wTideSjRFS0", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["food, sizzle, woman", "paper, fold, crinkle"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["an insect buzzes around continuously", "a vehicle engine runs and 
wind blows before women yell"], "sample_ids": ["v25l1jef3JY", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["buzzes, continuously, insect", "wind, blow, vehicle"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["water splashes and wind noise is made into a microphone", "a woman speaks followed by another woman whimpering and speaking"], "sample_ids": ["sDSppXIlJrs", "xOZfdgAgJ9o"], "start_seconds": ["27", "40"], "properties": ["microphone, water, wind", "woman, whimpering, speaking"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "footage of a woman talking to a man in a doctor's office"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity is a recording of a woman speaking?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "an engine runs loudly"], "sample_ids": ["v0x1odnXtP0", "vqZuVbG6-HI"], "start_seconds": ["210", "130"], "properties": ["keyboard, type, computer", "loud, engine, run"], "captions_pred_video": ["how to make money on youtube in spanish", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person is typing on a keyboard", "a lawn mower is running and men are speaking "], "question": "which is quieter", "label": 0}, {"captions": ["goats bleat and people speak", "a car accelerates and wind blows"], "sample_ids": ["z5iUE5h0EPs", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["goats bleat, people speak, language", "accelerates, wind, blows"], "captions_pred_video": ["of the goat in the barn", null], "captions_pred_audio": ["a goat bleats and a man speaks", "a race car accelerates and revs its engine "], 
"question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as horns blow", "water flows as men speak and yell"], "sample_ids": ["tHyNqRyK34A", "vJ7JPEFhyLA"], "start_seconds": ["24", "16"], "properties": ["a, man, speaks", "water, flow, men"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "a person sniffs and sneezes"], "sample_ids": ["x9JovgqUcs", "uRlbY6aoBU"], "start_seconds": ["500", "0"], "properties": ["a, man, speaks, keyboard", "sneezes, person, sniffs"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is sneezing "], "question": "which person is sick", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "someone whistles a tune"], "sample_ids": ["x9JovgqUcs", "sIXTftIuUgw"], "start_seconds": ["500", "90"], "properties": ["a, man, speaks, keyboard", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vfYTJq7nU", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["rustling, ducks, quack", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking while the wind is 
blowing and water is splashing"], "question": "which entity is about a duck?", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y8WEcpOlT3I", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["wind, speak, buffeting", "men, talk, cars"], "captions_pred_video": ["on how to use a sewing machine youtube", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a person is whistling", "a telephone rings followed by a woman talking"], "sample_ids": ["sIXTftIuUgw", "tGcFnX0GHI"], "start_seconds": ["90", "0"], "properties": ["person, whistling, person", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a person talking?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wwyfGO2J4", "xfaoyyzw2WU"], "start_seconds": ["90", "180"], "properties": ["people, applaud, hoot", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["people are clapping 
and speaking with background noise ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a stream of water runs briefly"], "sample_ids": ["tK4VlLsNxak", "x-PeY8Yb8M4"], "start_seconds": ["120", "300"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "stream, water, run"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["wP8ZKrlx3oA", "vVhthZ45k3Y"], "start_seconds": ["40", "30"], "properties": ["heavy, rain, fall", "cat, purr, hiss"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking and a cat is meowing"], "question": "which entity is more likely to be a natural phenomenon", "label": 0}, {"captions": ["an engine starts and increases in power", "a car accelerates and wind blows"], "sample_ids": ["zjTG0gaGCUI", "u0TrcHhkPQ"], "start_seconds": ["80", "20"], "properties": ["power, increase, engine", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a jet engine roars as wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a small engine idles continuously", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["y5WII6cTH7k", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["engine, idle, 
continuously", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a woman is speaking and a dog is whimpering"], "question": "which entity is not a television program?", "label": 0}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "some men converse over an engine running"], "sample_ids": ["xyL9F5VrjkE", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["wind, motor, distance", "men, converse, engine"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is a more active scene", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sU53zg9Jp7s", "uYT5gxnyMWM"], "start_seconds": ["380", "50"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "female, spraying, scream"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["food is frying while a woman speaks", "water pouring and bubbling"], "sample_ids": ["yhQ2Lg-7qDY", "uyRfq-jKPpo"], "start_seconds": ["130", "50"], "properties": ["food, woman, speak", "water, bubbles, pouring"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a faucet is running and a man is speaking", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a man speaks as a car is passing by"], "sample_ids": ["sWZzXuWYY", "sK4u5T8hW78"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a heavy rain falls endlessly"], "sample_ids": ["sofxkNWaP0s", "wP8ZKrlx3oA"], "start_seconds": ["30", "40"], "properties": ["wind, engine, louder", "heavy, rain, fall"], "captions_pred_video": ["of the airplane taking off 
from the runway with the speed limit sign in the foreground", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["water flows and trickles", "a child speaks in closed space"], "sample_ids": ["tB7hWb9gTuQ", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["water, flow, trickle", "child, space, speak"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["water is splashing and gurgling", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not a liquid", "label": 1}, {"captions": ["a church bell rings several times", "a speedboat passes quickly on the water"], "sample_ids": ["sUVVjE3Ucp8", "tjmoSi330GM"], "start_seconds": ["0", "23"], "properties": ["ring, bell, several", "speed, water, boat"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a church bell is ringing ", "a motorboat speeds through water with wind noise "], "question": "which entity is moving faster", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["uiS58TNyUiw", "siJFXfGWgDk"], "start_seconds": ["430", "50"], "properties": ["vocalize, bird, chirp", "a, bird, vehicle"], "captions_pred_video": ["of the pigeon in the cage", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking and birds are chirping in the background "], 
"question": "which entity is about birds?", "label": 0}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "people speak as gunfire rings out"], "sample_ids": ["yZrFNS7GFBQ", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["pigeon, buzzes, insect", "gunfire, ring, speak"], "captions_pred_video": ["of the bird in the cage", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["an animal quacks rapidly", "winds blows roughly as a vehicle races past"], "sample_ids": ["vh30P49Po6s", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["animal, quacks, rapidly", "wind, blows, vehicle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a duck is quacking loudly", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["an airplane engine runs", "an insect buzzes around continuously"], "sample_ids": ["yVPZ2MNWpms", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["engine, airplane, runs", "buzzes, continuously, insect"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a car is driving by on the road ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tDVADusiIoc", "zj2R0XoFr5k"], "start_seconds": ["60", "50"], "properties": ["wind, radio, waves", "airplane, boy, fly"], "captions_pred_video": ["a 
person riding on the back of a sailboat in rough seas", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["people speak softly as food sizzles", "a car speeding up in the distance"], "sample_ids": ["yhQ2Lg-7qDY", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["food, sizzle, speak", "distance, car, speed"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "loud clanking and banging with brief male speech"], "sample_ids": ["wSVhSdj0F0", "sWZzXuWYY"], "start_seconds": ["10", "420"], "properties": ["horn honks, keys jingle, slam", "male, speech, banging"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a sewing machine runs and a man speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a woman speaks as she rubs two objects together"], "sample_ids": ["xKB8O8LTs6s", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["music, gunfire, explosion", "two objects, woman, speak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "paper folding and crinkling"], "sample_ids": ["uWPRNLnpy7Y", "zPpG3RD8lSs"], "start_seconds": ["10", "20"], "properties": ["accelerate, laugh, vehicle", "paper, fold, crinkle"], "captions_pred_video": ["is taken from a car driving down the street", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": 
["a car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which entity is a paper folding and crinkling?", "label": 1}, {"captions": ["a consistent ticking pattern", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sCeWURVHfOM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticking, pattern, clock", "stream, water, flow"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "footage is blurry and out of focus"], "captions_pred_audio": ["ticking of a clock", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "a man speaks followed by another man speaking outside"], "sample_ids": ["s4Uz1Ffgo04", "viuTg1M-dqg"], "start_seconds": ["100", "30"], "properties": ["water, rushes, vehicle", "two men, speak, follow"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a child babbles as a woman speaks", "water splashes as an animal walks through"], "sample_ids": ["wEBlkGWVWwE", "w1ir-sZ3Im8"], "start_seconds": ["260", "90"], "properties": ["a, babble, woman", "animal, water, splashes"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a man speaks as a car is passing by"], 
"sample_ids": ["vzxHnu-SFEw", "sK4u5T8hW78"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "a, car, pass"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which object is moving", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["vf44CgrjT0A", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["loud, long, person", "people, applaud, hoot"], "captions_pred_video": ["a man in a striped shirt with his mouth open in 
front of a bookshelf", null], "captions_pred_audio": ["a loud burp", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a man speaks as a car is passing by"], "sample_ids": ["rwtmaKiCcQU", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["nozzle, depressed, spray can", "a, car, pass"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["spraying and people speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an electronic device bleeps once", "paper is crumpling consistently"], "sample_ids": ["tHJ6JSa8Y4", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["bleeps, electronic, device", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a clock is ticking and beeping", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "several insects fly while two men talk"], "sample_ids": ["vbpKkWvfOu4", "s-T9OVOiMLo"], "start_seconds": ["560", "330"], "properties": ["a, woman, man", "several, fly, men"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more people", "label": 0}, {"captions": ["an emergency siren wails as it passes", "a man speaks as a car is passing by"], "sample_ids": ["vGj1XLJvNrw", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["wails, wails, pass", "a, car, pass"], "captions_pred_video": ["footage of a police car driving down a city street", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a duck quacks continuously"], "sample_ids": ["wP8ZKrlx3oA", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["rain, storm, thunder", "quacks, continuously, duck"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", 
"television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["t25U-v4k4ts", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["bees buzz, birds chirp, man speaks", "a woman, a television program, a bird"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["someone snores nearby", "motors rev and run loudly as a person laughs"], "sample_ids": ["spJCm8tD9Zo", "zl9Dqx-j7q4"], "start_seconds": ["90", "6"], "properties": ["someone snores, nearby, someone", "motors rev, laugh, loudly"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person is snoring loudly", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xfaoyyzw2WU", "y8WEcpOlT3I"], "start_seconds": ["180", "40"], "properties": ["loud, jet engine, roar", "harsh, wind, blows"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "on how to use a sewing machine youtube"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking with wind noise in the background "], "question": "which entity is louder", "label": 0}, {"captions": ["a person snores hilariously while someone laughs", "a speedboat passes quickly on the water"], "sample_ids": ["sSMl2vc3ek", "tjmoSi330GM"], "start_seconds": ["20", "23"], "properties": ["a person, laughs, snores", "speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], 
"captions_pred_audio": ["a person snoring loudly", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["siJFXfGWgDk", "vXlk0lIQBFo"], "start_seconds": ["50", "470"], "properties": ["man, woman, vehicle", "wind, speak, vocalize"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["material crumbles into a microphone", "people talk quietly in the distance, followed by a police car siren wailing"], "sample_ids": ["vofpvUo6NAw", "wy1eKjR7KC0"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, microphone", "people, talk, distance"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "two police officers riding motorcycles down the street"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a man is speaking and a siren is going off"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "birds chirp and objects are moved around"], "sample_ids": ["x5cuQjOdM3E", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["cat, meows, young woman", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a cat meows and a woman speaks", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles 
then more metal rumbling followed by a guy speaking", "several insects fly while two men talk"], "sample_ids": ["sQGXqGcwOTc", "s-T9OVOiMLo"], "start_seconds": ["3", "330"], "properties": ["audio, kid, giggles", "several, fly, men"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video?", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a stream of water runs briefly"], "sample_ids": ["uKCSGgof8gI", "x-PeY8Yb8M4"], "start_seconds": ["12", "300"], "properties": ["chirps, distance, signal", "stream, water, run"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yI-KvObbDoY", "sSMl2vc3ek"], "start_seconds": ["260", "20"], "properties": ["sound, smack, wind", "loud, multiple, distance"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", null], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "continuous sizzling with a woman speaking towards the end"], "sample_ids": ["zdYdyF9-m8U", "ukxt9I7eMMg"], "start_seconds": ["7", "30"], "properties": ["wind, crash, shoreline", "continuous, woman, speaking"], "captions_pred_video": ["a person kayaking 
in the ocean near a cliff", "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["waves crash and wind blows ", "a woman is speaking while food is frying in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a child speaks", "a toilet flushes and water drains"], "sample_ids": ["yW6FWLSLkx4", "sfAvvZwdLCY"], "start_seconds": ["40", "20"], "properties": ["a, child, speaks", "water drains, flushes, water"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vlS6YMeWAPo", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["sheep, baa, birds", "female, spraying, scream"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a goat bleats and birds chirp", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a man speaks as horns blow"], "sample_ids": ["sK4u5T8hW78", "tHyNqRyK34A"], "start_seconds": ["30", "24"], "properties": ["a, car, pass", "a, man, speaks"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai", "being taken from inside a vehicle on the street at night"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a car is honking with background noise "], "question": "which entity is about a man speaking as a car passes by?", "label": 0}, {"captions": ["a car accelerates and wind blows", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["u0TrcHhkPQ", "tw76HGONaKg"], "start_seconds": ["20", "570"], "properties": ["accelerates, wind, blows", "A, game, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man speaks and types on a computer keyboard "], "question": "which entity is playing a game", "label": 1}, {"captions": ["a baby cries and a woman speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tMbMDvT50j8", "vb1fPSDI4c"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "multiple, people, yell"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], 
"captions_pred_audio": ["a baby cries and a woman speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "people speak as gunfire rings out"], "sample_ids": ["xNMovAf3o50", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["rain, thunder, music", "gunfire, ring, speak"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a train horn blows as it passes by"], "sample_ids": ["sLUnaPT5gM8", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "horn, blows, train"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a train whistle blows and a train passes by with a whistle blowing "], "question": 
"which entity is not a train?", "label": 0}, {"captions": ["motors rev and run loudly as a person laughs", "a horn rings out as a machine runs by"], "sample_ids": ["zl9Dqx-j7q4", "slZLHwNbbt4"], "start_seconds": ["6", "300"], "properties": ["motors rev, laugh, loudly", "a, horn, run"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a jet engine roars ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "plastic is tapped on while someone speaks"], "sample_ids": ["vf44CgrjT0A", "wvKpEYswXO0"], "start_seconds": ["20", "150"], "properties": ["loud, long, person", "plastic, tap, speak"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a loud burp", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "an infant crying as a woman laughs"], "sample_ids": ["w2JXXIAdUdg", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["snoring, distance, person", "a, laugh, infant"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["v5P-ThUCINM", "zj2R0XoFr5k"], "start_seconds": ["400", "50"], "properties": ["background, 
chirp, bird", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["waves of water rumble", "a car speeding up in the distance"], "sample_ids": ["vwqaIHKxLvM", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["sound, wave, water", "distance, car, speed"], "captions_pred_video": ["of a surfer riding a big wave in the ocean", null], "captions_pred_audio": ["waves crash and wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["sawing of wood and rustling with leaves blowing in the distance", "a man speaks as a motor runs in the background"], "sample_ids": ["uiItxDsDMFI", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["sound, distance, leaves", "background, motor, run"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a saw is being used with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sZPuqDgX2V0", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["engine, accelerate, intercom", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity shows a man talking to the camera?", "label": 0}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "water pouring and bubbling"], "sample_ids": ["vqZuVbG6-HI", "uyRfq-jKPpo"], "start_seconds": ["130", "50"], "properties": ["background, male, female", "water, bubbles, pouring"], "captions_pred_video": ["footage is blurry because it's raining outside", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a cat meows and children speak", "some tunes played by whistling"], "sample_ids": ["x5cuQjOdM3E", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["cat, speak, children", "tune, play, whistling"], 
"captions_pred_video": ["a black background with an airplane flying in the sky", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a cat meows and a woman speaks", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "someone is typing on a computer keyboard"], "sample_ids": ["tDVADusiIoc", "v0x1odnXtP0"], "start_seconds": ["60", "210"], "properties": ["wind, radio, waves", "keyboard, type, computer"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person is typing on a keyboard"], "question": "which is not a type of device", "label": 0}, {"captions": ["distant men speak as a spray can nozzle is depressed", "an airplane engine runs"], "sample_ids": ["rwtmaKiCcQU", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["nozzle, depressed, spray can", "engine, airplane, runs"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["spraying and people speaking", "a car is driving by on the road "], "question": "which is a moving object", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "an engine works in idle nearby followed by a man talking"], "sample_ids": ["vhJWZheqaE", "wqADXCzngMw"], "start_seconds": ["0", "340"], "properties": ["water drains unevenly, toilet flushes, water drains", "engine, idle, man"], "captions_pred_video": [null, "of a man working on a vintage volkswagen beetle"], "captions_pred_audio": ["a toilet is flushed", "a lawn mower is running and a man is speaking "], "question": "which entity is a machine?", "label": 0}, {"captions": ["water splashing and 
a person laughs in the distance then a man speaks nearby", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vddP56-ogds", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["water, splash, person, laugh", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water?", "label": 0}, {"captions": ["long loud burping by a man", "wind blows as people chatter quietly"], "sample_ids": ["xmiUIOhtZyQ", "xBxDz0CFVn0"], "start_seconds": ["60", "30"], "properties": ["loud, burp, man", "wind, chatter, people"], "captions_pred_video": ["homer simpson drinking a beer", "footage is blurry and out of focus"], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a man speaks followed by another man speaking outside"], "sample_ids": ["wsHBIgzs9Fs", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["horn, continuous, buzzing", "two men, speak, follow"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking followed by another man speaking?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["ylpYOorfH4o", "wz7N8YRy74I"], "start_seconds": ["410", "30"], "properties": 
["engine, running, wind", "rooster, crow, background, men"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["an engine runs and a man speaks", "a train horn blows as it passes by"], "sample_ids": ["yT5WfYMRr-U", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "horn, blows, train"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which train is blowing its horn", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yDoT73BWsdA", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["engine, revs, vehicle", "loud, multiple, distance"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["wSVhSdj0F0", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["beep, clang, footsteps", "a train, a horn, a bell"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sYITalLZjj4", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["water, rushes, background, birds", "three men, wind, flow"], "captions_pred_video": ["two ducks are swimming in the water near each other", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 0}, {"captions": ["a diesel truck engine runs 
continuously", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sZvwOuuPGP0", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["engine, diesel, truck", "engine, idle, woman"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a medium engine is running ", "a woman is speaking and a subway train is moving "], "question": "which entity has a running engine", "label": 0}, {"captions": ["water flows as a woman laughs and a man speaks", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["vddP56-ogds", "sapQIQUhFc"], "start_seconds": ["30", "280"], "properties": ["water, flow, laugh", "liquid, flow, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and a stream is flowing in the background "], "question": "which entity shows a man speaking?", "label": 0}, {"captions": ["a cat meows and children speak", "water is sprayed across a hard surface"], "sample_ids": ["x5cuQjOdM3E", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["cat, speak, children", "water, spray, surface"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a cat meows and a woman speaks", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "dishes cling together then a man begins to speak"], "sample_ids": ["xyL9F5VrjkE", "sQGXqGcwOTc"], "start_seconds": ["20", "3"], "properties": ["wind, blows, vehicle", "cling, speak, dishes"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "a man washing dishes in a commercial kitchen"], 
"captions_pred_audio": ["the wind is blowing and a car is passing by ", "mechanisms are operating and water is splashing "], "question": "which entity is about a vehicle engine running?", "label": 0}, {"captions": ["an insect buzzes around continuously", "wind blowing followed by a zoom"], "sample_ids": ["v25l1jef3JY", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["buzzes, continuously, insect", "wind, blow, zoom"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["a airplane flies overhead as a woman speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zj2R0XoFr5k", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["airplane, fly, woman", "a woman, laughs, animal"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a drill runs and two people laugh", "a stream of water runs briefly"], "sample_ids": ["tEE3MpBt1sg", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["two people, laugh, drill", "stream, water, run"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "water splashes as an animal walks 
through"], "sample_ids": ["xjvTpk2Zpr8", "w1ir-sZ3Im8"], "start_seconds": ["70", "90"], "properties": ["wind, blows, vehicle", "animal, water, splashes"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a jet engine roars and wind blows ", "water splashes and gurgles as people speak"], "question": "which entity is more calm", "label": 1}, {"captions": ["birds fly and flutter around", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wGKgwOP3h30", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["fly, flutter, around", "loud, laughter, intermittent"], "captions_pred_video": ["of the pigeons in the coop", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["pigeons coo and flap their wings", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zofjfKhqLk8", "wz7N8YRy74I"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "rooster, crow, background, men"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a horn rings out as a machine runs by"], "sample_ids": ["sYITalLZjj4", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["water, rushes, background, birds", "a, horn, run"], "captions_pred_video": ["two 
ducks are swimming in the water near each other", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["wind blows and birds chirp", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is more active", "label": 1}, {"captions": ["water rushes by", "a man talks as several small engines run"], "sample_ids": ["x-PeY8Yb8M4", "u9A6VZQCZpU"], "start_seconds": ["300", "30"], "properties": ["water, rushes, by", "a, man, talk"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking while a race car is revving and accelerating "], "question": "which entity is a video of a man talking?", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vBslzh7saPw", "tiDFTC-5vU"], "start_seconds": ["90", "30"], "properties": ["engine, spools, takes", "male, duck, laugh"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a toilet flushes and a female speaks"], "sample_ids": ["zj2R0XoFr5k", "yaln9y8I7ms"], "start_seconds": ["50", "230"], "properties": ["airplane, boy, fly", "female, flushes, toilet"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "water flows 
followed by women screaming"], "sample_ids": ["zcDwZ6W7E3E", "w5W5Kqtc8E"], "start_seconds": ["180", "100"], "properties": ["man, speak, motorcycles", "water, flow, women"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["zF8yoL0rkbI", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["engine, run, someone", "a, chirps, bird"], "captions_pred_video": ["footage of the traffic on the street at night", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking and bees are buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["continuous sneezing together with speech", "a person is whistling"], "sample_ids": ["x4dZyf9Gbj0", "sIXTftIuUgw"], "start_seconds": ["130", "90"], "properties": ["continuous, sneeze, speech", "person, whistling, person"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "a person whistling a song"], "question": "which entity is a person", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a helicopter engine runs"], "sample_ids": ["ukg5L09Wpvo", "t5ZbXbniOWk"], "start_seconds": ["150", "30"], "properties": ["clickety-clack, train, whistle", "engine, helicopter, run"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube"], 
"captions_pred_audio": ["a train blows its whistle and blows its horn ", "a helicopter is flying overhead "], "question": "which entity is a machine", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "people cheer as a vehicle engine revs"], "sample_ids": ["tEE3MpBt1sg", "xjhAnI2q6hM"], "start_seconds": ["50", "6"], "properties": ["drill, something, laugh", "engine revs, vehicle, people"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a truck is revving its engine and a man is speaking "], "question": "which is a vehicle", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["xSKJGCItUWE", "xyL9F5VrjkE"], "start_seconds": ["10", "20"], "properties": ["engine, run, boy", "wind, motor, distance"], "captions_pred_video": ["footage of the helicopter flying in the room", "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "the wind is blowing and a car is passing by "], "question": "which entity is a motor?", "label": 1}, {"captions": ["a woman and man are speaking", "frogs croak and vocalize"], "sample_ids": ["vbpKkWvfOu4", "yswmmRZFItk"], "start_seconds": ["560", "0"], "properties": ["two people, speaking, woman, man", "croak, vocalize, frog"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a frog is croaking"], 
"question": "which entity is not a person", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a man speaks as a motor runs in the background"], "sample_ids": ["wqADXCzngMw", "xZepNM9qcRA"], "start_seconds": ["340", "30"], "properties": ["engine, idle, man", "background, motor, run"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks while water drains", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vSeGhaZt-aI", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["water, drain, man", "animal, grunts, snorts"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "wind blows as people chatter quietly"], "sample_ids": ["wTideSjRFS0", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["food, sizzle, woman", "wind, chatter, people"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "water is sprayed across a hard surface"], "sample_ids": ["u--KhUW8l1Y", "sQwlkXjQabo"], "start_seconds": ["0", 
"10"], "properties": ["engine, sound, horn", "water, spray, surface"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a fly buzzes around loudly as birds chirp"], "sample_ids": ["tQWGZLItBXk", "uJV8NDaHqqk"], "start_seconds": ["170", "100"], "properties": ["music, kid, speak", "loud, fly, chirp"], "captions_pred_video": ["worms revolution screenshots", "a bee hive in a wooden box"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a swarm of bees buzzing around"], "question": "which entity is louder", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a person snores loudly multiple times at a close distance"], "sample_ids": ["zFjIWfSD-4", "sSMl2vc3ek"], "start_seconds": ["410", "20"], "properties": ["People, motor, brakes", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["water splashes as an animal walks through", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w1ir-sZ3Im8", "xKB8O8LTs6s"], "start_seconds": ["90", "70"], "properties": ["animal, water, splashes", "music, gunfire, explosion"], "captions_pred_video": ["footage of a group of people riding horses through a river", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["water splashes and gurgles as people speak", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 
1}, {"captions": ["a person is burping then speaks and laughs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wAAkbZToh8", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["burp, laugh, speak", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man burps and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a duck quacks loudly and continuously"], "sample_ids": ["yVumC9TGknc", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["humming, clock, birds", "loud, continuous, quacks"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a series of beeps and chirps", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a stream runs then someone speaks", "some men converse over an engine running"], "sample_ids": ["wbHTKEJZyhc", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["stream, run, someone", "men, converse, engine"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a 
river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a stream running?", "label": 0}, {"captions": ["a person screams glaringly", "wind blowing followed by a zoom"], "sample_ids": ["xC8kbrKJmco", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["glaringly, screams, person", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a goat is bleating ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["xBxDz0CFVn0", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["stream, water, flow", "two men, speak, follow"], "captions_pred_video": ["footage is blurry and out of focus", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a infant makes noise and is excited"], "sample_ids": ["sSMl2vc3ek", "wIJK3-5y0kA"], "start_seconds": ["20", "30"], "properties": ["loud, multiple, distance", "noise, 
excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a person snoring loudly", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "children speak and play together"], "sample_ids": ["x5cuQjOdM3E", "yVVP8XvWJTo"], "start_seconds": ["30", "260"], "properties": ["cat, meows, young woman", "children, speak, play"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a cat meows and a woman speaks", "children are speaking and breathing with background noise "], "question": "which entity is more social", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "dishes cling together then a man begins to speak"], "sample_ids": ["yajyRTUQk3U", "sQGXqGcwOTc"], "start_seconds": ["400", "3"], "properties": ["a woman, something, fried", "cling, speak, dishes"], "captions_pred_video": ["- a woman cooking in the kitchen", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "some tunes played by whistling"], "sample_ids": ["yRx9txMcBl0", "u6BnG6YZqJ4"], "start_seconds": ["40", "0"], "properties": ["motors, tires, screech", "tune, play, whistling"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "a door opens and closes"], "sample_ids": ["w2bYrCVLT60", "vBHyYJ8pL0"], "start_seconds": ["120", "2"], "properties": ["ducks, speak, quack", "open, close, door"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is a door?", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["ugHJF0hfYkg", "t97k0cejSQE"], "start_seconds": ["10", "250"], "properties": ["loud, intense, propeller", "sound, chirp, buzz"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a bee on a purple thistle flower"], "captions_pred_audio": ["a helicopter is flying overhead ", "a bee buzzes and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tdWhHV3X25Q", "vYkA3cfXp5Q"], "start_seconds": ["60", "30"], "properties": ["applause, audience, yells", "engine, accelerate, idle"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "an engine is idling"], 
"question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a clock ticktocks briefly", "birds chirp and objects are moved around"], "sample_ids": ["u7C-AEBQM", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["ticktocks, clock, ticktocks briefly", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a ticktock of a clock", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "wind blowing followed by a zoom"], "sample_ids": ["s4Uz1Ffgo04", "vr8ZXjEBhMQ"], "start_seconds": ["100", "150"], "properties": ["water, rushes, vehicle", "wind, blow, zoom"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a video of a vehicle zooming past?", "label": 0}, {"captions": ["an aircraft engine runs as people speak", "water flows as men speak and yell"], "sample_ids": ["wTjoRj1se3U", "vJ7JPEFhyLA"], "start_seconds": ["390", "16"], "properties": ["engine, run, people", "water, flow, men"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and yelling?", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uPDn2BFTHk", "su6FAOcOA8c"], "start_seconds": ["140", "4"], 
"properties": ["lady, laugh, baby", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a goat bleats as a person speaks", "small dogs yip and bark sharply"], "sample_ids": ["tPJvjq9QePY", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["bleats, person, speak", "bark, yip, sharply"], "captions_pred_video": ["a dog and a sheep in a barn", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a baby cries and a man speaks", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "winds blows roughly as a vehicle races past"], "sample_ids": ["zl9Dqx-j7q4", "xjvTpk2Zpr8"], "start_seconds": ["6", "70"], "properties": ["engine, laugh, loud", "wind, blows, vehicle"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a jet engine roars ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks as a car is passing by", "men speak and a nozzle sprays liquid"], "sample_ids": ["sK4u5T8hW78", "wRV8yMk886E"], "start_seconds": ["30", "0"], "properties": ["a, car, pass", "liquid, spray, nozzle"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man speaks followed by a loud burst"], "question": "which entity is about liquid spraying?", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["zO-LSSY92ZM", "zO-LSSY92ZM"], "start_seconds": ["30", "30"], "properties": ["liquid, surface, sound", "liquid, surface, sound"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how 
to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["steam is hissing and hissing", "steam is hissing and hissing"], "question": "which entity is a liquid?", "label": 0}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a man speaks as a motor runs in the background"], "sample_ids": ["w8uLijTqtlU", "xZepNM9qcRA"], "start_seconds": ["70", "30"], "properties": ["wind, microphone, noise", "background, motor, run"], "captions_pred_video": ["footage is blurry and shaky", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["the wind is blowing strongly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vhJWZheqaE", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["water drains unevenly, toilet flushes, water drains", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["water pouring and bubbling", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uyRfq-jKPpo", "w5W5Kqtc8E"], "start_seconds": ["50", "100"], "properties": ["water, bubbles, pouring", "wind, blow, vehicle"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", null], "captions_pred_audio": ["water is running from a faucet", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a telephone rings and a bird vocalizes", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["skd2PphS6oI", "t97k0cejSQE"], "start_seconds": ["190", "250"], "properties": ["ring, bird, vocalize", "sound, chirp, buzz"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "a bee on a purple thistle flower"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a bee buzzes and a woman speaks"], "question": "which entity has a bird vocalize?", "label": 0}, {"captions": ["white noise and snoring with some rustling in the background", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xzKKf9bKNUo", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["background, noise, snoring", "three men, wind, flow"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, 
{"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "people speak and tapping occurs"], "sample_ids": ["x5cuQjOdM3E", "tFCUUGdREgA"], "start_seconds": ["30", "70"], "properties": ["cat, talk, meow", "people, tap, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a person riding a white horse in an indoor arena"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and walking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a car accelerates and wind blows", "a man speaks as a car is passing by"], "sample_ids": ["u0TrcHhkPQ", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which car is moving faster", "label": 0}, {"captions": ["an adult man speaks over glass clinking", "wind blows as people chatter quietly"], "sample_ids": ["u6jIvCtKarQ", "xBxDz0CFVn0"], "start_seconds": ["70", "30"], "properties": ["a, man, speaks", "wind, chatter, people"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 
1}, {"captions": ["wind blows and people scream while an engine revs", "wind blows as people chatter quietly"], "sample_ids": ["w5W5Kqtc8E", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zY3icUyMdh8", "ukg5L09Wpvo"], "start_seconds": ["20", "150"], "properties": ["dog, bark, engine", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["y1saVTXsKwc", "uYT5gxnyMWM"], "start_seconds": ["80", "50"], "properties": ["a, dog, talk", "female, spraying, scream"], "captions_pred_video": ["a dog playing with a pink ball", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog barks and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "waves crash against a shoreline and people speak"], "sample_ids": ["sEprKHm8Sj8", "yFB25fqfU8I"], "start_seconds": ["90", "300"], "properties": ["noise, loud, buzzing", "wave, crash, shoreline"], "captions_pred_video": ["rally 2012 2013 
2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "people speak softly as food sizzles"], "sample_ids": ["y8dSeubCNI", "yhQ2Lg-7qDY"], "start_seconds": ["4", "130"], "properties": ["engine revving, people speaking, motorcycle", "food, sizzle, speak"], "captions_pred_video": [null, "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["an engine revving and people talking in the background", "a faucet is running and a man is speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["ticking continues without interruption", "a frog croaks as other frogs croak in the background"], "sample_ids": ["v-g-j2uTByM", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["ticking, continuous, clock", "background, frog, croak"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a close up of a frog in the water"], "captions_pred_audio": ["a clock is ticking loudly", "a frog is croaking"], "question": "which entity is not continuous", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xOZfdgAgJ9o", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["woman, whimpering, speaking", "a woman, something, fried"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "- a woman cooking in the kitchen"], 
"captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking while food is frying in the background"], "question": "which woman is speaking", "label": 1}, {"captions": ["a toilet flushes and water drains", "pigeons vocalize and birds chirp"], "sample_ids": ["sfAvvZwdLCY", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["water drains, flushes, water", "vocalize, bird, chirp"], "captions_pred_video": ["footage of the toilet in the bathroom", "of the pigeon in the cage"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "people applaud and hoot and chat quietly"], "sample_ids": ["vMf1dLD6Sng", "wwyfGO2J4"], "start_seconds": ["6", "90"], "properties": ["frog, bird, vocalize", "people, applaud, hoot"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", null], "captions_pred_audio": ["a frog croaks loudly", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a infant makes noise and is excited"], "sample_ids": ["wPz6QRAkEb4", "wIJK3-5y0kA"], "start_seconds": ["60", "30"], "properties": ["chirps, tweets, song", "noise, excited, infant"], "captions_pred_video": ["a bird in a cage on top of a pole", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["birds are chirping in the background ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "an engine works in idle nearby followed by a man talking"], "sample_ids": ["weDbePuc-Xc", "wqADXCzngMw"], "start_seconds": ["40", "340"], "properties": ["cartoon character, music, vocalize", "engine, idle, man"], "captions_pred_video": ["a cartoon frog and 
a butterfly are sitting on the ground next to each other", "of a man working on a vintage volkswagen beetle"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a lawn mower is running and a man is speaking "], "question": "which entity is a video of a man talking?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["zcDwZ6W7E3E", "uEU-Hg5MTN8"], "start_seconds": ["180", "27"], "properties": ["man, speak, motorcycles", "animal, grunts, snorts"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a duck quacks continuously", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vh30P49Po6s", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["quacks, continuously, duck", "music, gunfire, explosion"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a duck is quacking loudly", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a scene from a movie", "label": 1}, {"captions": ["continuous snoring", "a car speeding up in the distance"], "sample_ids": ["sLkeqCDJIyw", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["loud, snoring, noise", "distance, car, speed"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["people clap and speak in the distance", "vehicle tires screech and a man speaks before a car door opens"], "sample_ids": ["wwyfGO2J4", "sxYkFKFIZD0"], "start_seconds": ["90", "20"], "properties": ["clap, distance, speak", "screech, man, door"], "captions_pred_video": [null, "2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking while a car is revving and accelerating with a squeal in the background "], "question": "which entity is about a car door opening?", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a frog croaks as other frogs croak in the background"], "sample_ids": ["xKB8O8LTs6s", "yswmmRZFItk"], "start_seconds": ["70", "0"], "properties": ["music, gunfire, explosion", "background, frog, croak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a close up of a frog in the water"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a frog is croaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "someone is typing on a computer keyboard"], "sample_ids": ["uYT5gxnyMWM", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["a, scream, girl", "keyboard, type, computer"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "how to make money 
on youtube in spanish"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["someone snores nearby", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["spJCm8tD9Zo", "wz7N8YRy74I"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "rooster, crow, background, men"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["an insect buzzes around continuously", "a duck quacks continuously"], "sample_ids": ["v25l1jef3JY", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "quacks, continuously, duck"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["electronic beeps occur in a short series", "someone whistles a tune"], "sample_ids": ["y682ml90jGw", "sIXTftIuUgw"], "start_seconds": ["11", "90"], "properties": ["beeps, series, electronic", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a car accelerates and wind blows"], "sample_ids": ["zgUgkpk78xU", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["horn, bell, train", "accelerates, wind, blows"], 
"captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as horns blow", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tHyNqRyK34A", "tDVADusiIoc"], "start_seconds": ["24", "60"], "properties": ["a, man, speaks", "water, radio, man"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sOa7g-44Dag", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["background, man, spray", "stream, water, flow"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a man speaks followed by another man speaking outside"], "sample_ids": ["sG7TyPnFDR0", "viuTg1M-dqg"], "start_seconds": ["180", "30"], "properties": ["beeps, 
machine, smoke alarm", "two men, speak, follow"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking while a machine runs?", "label": 0}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uEU-Hg5MTN8", "tiDFTC-5vU"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "male, duck, laugh"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["loud, continuous burping", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y636gklDioE", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["loud, continuous, burping", "loud, multiple, distance"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "wind blows as people chatter quietly"], "sample_ids": ["yaln9y8I7ms", "xBxDz0CFVn0"], "start_seconds": ["230", "30"], "properties": ["female, flushes, toilet", "wind, chatter, people"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["male speech followed by light wind, 
rustling, distant speech and brief hissing", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["w9lpbUn0hPc", "vzceMbklWc"], "start_seconds": ["30", "180"], "properties": ["male, wind, rustling", "water, faucet, sink"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", null], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "water is running and a man is speaking"], "question": "which entity is a video of water running?", "label": 1}, {"captions": ["people clap and speak in the distance", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wwyfGO2J4", "vb1fPSDI4c"], "start_seconds": ["90", "30"], "properties": ["clap, distance, speak", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zhx6hoYrHeI", "wqZ135Ssz0"], "start_seconds": ["160", "60"], "properties": ["engine, sputter, rough", "two men, woman, birds"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["a clock ticktocks in wind", "small dogs yip and bark sharply"], "sample_ids": ["yVumC9TGknc", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["ticktocks, clock, wind", "bark, yip, sharply"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage is blurry and shaky, making it 
difficult to see what is happening"], "captions_pred_audio": ["a series of beeps and chirps", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a motorcycle idles loudly as wind blows"], "sample_ids": ["sOa7g-44Dag", "v7jJS8aAyA"], "start_seconds": ["30", "10"], "properties": ["background, man, spray", "wind, blows, loudly"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a motorcycle engine is idling and vibrating"], "question": "which entity is louder", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "a car speeding up in the distance"], "sample_ids": ["tDlysoZiA1I", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["animal, grunt, multiple", "distance, car, speed"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a stream of water runs briefly"], "sample_ids": ["wRBHTgrbiwg", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["bird, owl, speak", "stream, water, run"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["y8dSeubCNI", "sYITalLZjj4"], "start_seconds": ["4", "30"], 
"properties": ["engine revving, people speaking, motorcycle", "water, rushes, background, birds"], "captions_pred_video": [null, "two ducks are swimming in the water near each other"], "captions_pred_audio": ["an engine revving and people talking in the background", "wind blows and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["a person is snoring while sleeping", "people cheer as a vehicle engine revs"], "sample_ids": ["vJrjSeP17yE", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["a person is sleeping, snoring, person", "engine revs, vehicle, people"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person snoring loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a woman speaks with water running", "a horn blasts as warning bells ring"], "sample_ids": ["wTideSjRFS0", "zgUgkpk78xU"], "start_seconds": ["30", "70"], "properties": ["water, running, woman", "horn, bells, ring"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a warning", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "water pouring and bubbling"], "sample_ids": ["soTOh3zYJfY", "uyRfq-jKPpo"], "start_seconds": ["40", "50"], "properties": ["vehicle, skid, tires", "water, bubbles, 
pouring"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a man speaking with light rustling"], "sample_ids": ["y1saVTXsKwc", "zOZleIRqZm4"], "start_seconds": ["80", "80"], "properties": ["a, dog, talk", "light, rustling, man"], "captions_pred_video": ["a dog playing with a pink ball", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking with crickets chirping in the background"], "question": "which entity is a man speaking with light rustling?", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tDVADusiIoc", "uYT5gxnyMWM"], "start_seconds": ["60", "50"], "properties": ["man, radio, blows", "female, spraying, scream"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while the 
wind is blowing and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a man speaks as a car is passing by"], "sample_ids": ["uWPRNLnpy7Y", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "a, car, pass"], "captions_pred_video": ["is taken from a car driving down the street", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["a man talks while vehicles pass by", "winds blows roughly as a vehicle races past"], "sample_ids": ["sK4u5T8hW78", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["a, man, talk", "wind, blows, vehicle"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing 
past?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uYT5gxnyMWM", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["person, spray, yell", "a woman, something, fried"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "pigeons vocalize and birds chirp"], "sample_ids": ["rqfQRErjfk8", "uiS58TNyUiw"], "start_seconds": ["170", "430"], "properties": ["crowd, cheers, applauds", "vocalize, bird, chirp"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "of the pigeon in the cage"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a man is speaking and a bee is buzzing"], "question": "which entity is not a crowd?", "label": 1}, {"captions": ["a goat screams and people speak in the background", "vehicles pass by on a roadway"], "sample_ids": ["xC8kbrKJmco", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["background, goat, scream", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a goat is bleating ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "paper is crumpling consistently"], "sample_ids": ["yVumC9TGknc", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["humming, clock, birds", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of the person holding a pair 
of scissors"], "captions_pred_audio": ["a series of beeps and chirps", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a man speaks as a car is passing by"], "sample_ids": ["wRBHTgrbiwg", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["bird, owl, speak", "a, car, pass"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking briefly?", "label": 0}, {"captions": ["distant humming of an engine", "small dogs yip and bark sharply"], "sample_ids": ["yVPZ2MNWpms", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["sound, distance, engine", "bark, yip, sharply"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a car is driving by on the road ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle engine runs while a siren and horn sound", "an audience gives applause"], "sample_ids": ["u--KhUW8l1Y", "x6iCUDmRpKQ"], "start_seconds": ["0", "38"], "properties": ["engine, sound, horn", "applause, audience, give"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a black background with the moon and stars 
in the sky"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a group of people are clapping and cheering"], "question": "which entity is a response to a stimulus", "label": 1}, {"captions": ["water flows and trickles", "a stream of water runs briefly"], "sample_ids": ["tB7hWb9gTuQ", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["water, flow, trickle", "stream, water, run"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["water is splashing and gurgling", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["wind blows strongly", "a horn rings out as a machine runs by"], "sample_ids": ["w8uLijTqtlU", "slZLHwNbbt4"], "start_seconds": ["70", "300"], "properties": ["wind, blows, strongly", "a, horn, run"], "captions_pred_video": ["footage is blurry and shaky", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["the wind is blowing strongly", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is moving", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "a car accelerates and wind blows"], "sample_ids": ["uPDn2BFTHk", "u0TrcHhkPQ"], "start_seconds": ["140", "20"], "properties": ["woman, laughs, speaks", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["women speak and laugh as wind blows", "music plays, a person speaks, followed by whooshes and a ding"], "sample_ids": ["un9VQlzgZM", "tQWGZLItBXk"], "start_seconds": ["5", "170"], "properties": ["wind, speak, laugh", "music, person, ding"], "captions_pred_video": [null, "worms revolution 
screenshots"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a person speaking?", "label": 1}, {"captions": ["food is frying then a woman speaks", "a horn rings out as a machine runs by"], "sample_ids": ["ukxt9I7eMMg", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["food, woman, speak", "a, horn, run"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["tjmoSi330GM", "ukg5L09Wpvo"], "start_seconds": ["23", "150"], "properties": ["speed, water, boat", "clickety-clack, train, whistle"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a train blows its whistle and blows its horn "], "question": "which entity is moving at a slower speed", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "wind blows as people chatter quietly"], "sample_ids": ["zgUgkpk78xU", "xBxDz0CFVn0"], "start_seconds": ["70", "30"], "properties": ["horn, bells, ring", "wind, chatter, people"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 
1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage is blurry and out of focus"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a toilet flushes and a female speaks"], "sample_ids": ["w2M4i1mklOA", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["loud, chime, bell", "female, flushes, toilet"], "captions_pred_video": ["footage of an antique clock", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a toilet flushes and a man speaks"], "question": "which entity is silent", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "a stream of water runs briefly"], "sample_ids": ["vSeGhaZt-aI", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["water, bubbles, speak", "stream, water, run"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vddP56-ogds", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["water, splash, person, laugh", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking nearby?", "label": 
0}, {"captions": ["an engine works in idle nearby followed by a man talking", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wqADXCzngMw", "su6FAOcOA8c"], "start_seconds": ["340", "4"], "properties": ["engine, idle, man", "engine, idle, woman"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity has a man talking to an engine?", "label": 0}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "people applaud and hoot and chat quietly"], "sample_ids": ["sQGXqGcwOTc", "wwyfGO2J4"], "start_seconds": ["3", "90"], "properties": ["audio, kid, giggles", "people, applaud, hoot"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone is snoring while sleeping", "a child speaks in closed space"], "sample_ids": ["ujMt0-D-x2k", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["snore, sleep, someone", "child, space, speak"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["u2f5NpsoHBg", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "rooster, crow, 
background, men"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity shows a person speaking?", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["ylpYOorfH4o", "siJFXfGWgDk"], "start_seconds": ["410", "50"], "properties": ["engine, run, loud", "man, woman, vehicle"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and birds are chirping in the background "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["slZLHwNbbt4", "uZesmtKZGSw"], "start_seconds": ["300", "250"], "properties": ["train, horn, sound", "men, talk, cars"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks over intermittent keyboard taps", "a woman speaks as she rubs two objects together"], "sample_ids": ["tw76HGONaKg", "vzxHnu-SFEw"], "start_seconds": ["570", "80"], "properties": ["audio, man, keyboard", "two objects, woman, speak"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a man speaks as a boat engine runs"], "sample_ids": ["zhx6hoYrHeI", "wtDqrBygTcU"], "start_seconds": ["160", "30"], "properties": ["engine, sputter, rough", "man, engine, run"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "shows a person riding on the back of a boat as it speeds through the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a motor is running"], "question": "which entity is a boat?", "label": 1}, {"captions": ["a woman and man are speaking", "some men converse over an engine running"], "sample_ids": ["vbpKkWvfOu4", "sCiy7QS1U"], "start_seconds": ["560", "300"], "properties": ["two people, speaking, woman, man", "men, converse, engine"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and 
a man is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows two people speaking", "label": 0}, {"captions": ["a child speaks in closed space", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yW6FWLSLkx4", "tiDFTC-5vU"], "start_seconds": ["40", "30"], "properties": ["child, space, speak", "male, duck, laugh"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a stream of water runs briefly", "a man speaks as a car is passing by"], "sample_ids": ["x-PeY8Yb8M4", "sK4u5T8hW78"], "start_seconds": ["300", "30"], "properties": ["stream, water, run", "a, car, pass"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while water drains", "paper folding and crinkling"], "sample_ids": ["vSeGhaZt-aI", "zPpG3RD8lSs"], "start_seconds": ["50", "20"], "properties": ["water, drain, man", "paper, fold, crinkle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece 
of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "people speak softly as food sizzles"], "sample_ids": ["yswmmRZFItk", "yhQ2Lg-7qDY"], "start_seconds": ["0", "130"], "properties": ["background, frog, croak", "food, sizzle, speak"], "captions_pred_video": ["a close up of a frog in the water", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a frog is croaking", "a faucet is running and a man is speaking"], "question": "which entity is silent", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a child speaks in closed space"], "sample_ids": ["xC8kbrKJmco", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["background, goat, scream", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a goat is bleating ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["an engine 
works in idle nearby followed by a man talking", "heavy rain splashes as it falls"], "sample_ids": ["wqADXCzngMw", "wP8ZKrlx3oA"], "start_seconds": ["340", "40"], "properties": ["engine, idle, man", "fall, rain, splash"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a heavy rain is falling on a surface"], "question": "which entity is a liquid", "label": 1}, {"captions": ["some clanking with distant murmuring", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uMTTDZ2mb4", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["clanking, murmuring, distant", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a woman speaks as she rubs two objects together"], "sample_ids": ["vVhthZ45k3Y", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["cat, purr, hiss", "two objects, woman, speak"], "captions_pred_video": ["footage is blurry and out of focus", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "someone snores nearby"], "sample_ids": ["wztCSUxOf8", "spJCm8tD9Zo"], "start_seconds": ["130", "90"], "properties": ["a crowd, yells, applauds", "someone snores, nearby, someone"], "captions_pred_video": [null, "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a baby cries and a woman speaks"], "sample_ids": ["tDVADusiIoc", "tMbMDvT50j8"], "start_seconds": ["60", "12"], "properties": ["water, radio, man", "a, cry, woman"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby cries and a woman speaks"], "question": "which entity is a human speaking?", "label": 0}, {"captions": ["a beep occurs briefly", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["xtWeJ56-U-g", "ukg5L09Wpvo"], "start_seconds": ["20", "150"], "properties": ["beep, occur, briefly", "a train, a horn, a bell"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a train blows its whistle and blows its horn "], "question": "which entity is a warning", "label": 1}, {"captions": ["water runs into a sink while men speak", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vzceMbklWc", "tiDFTC-5vU"], "start_seconds": ["180", "30"], "properties": ["water, sink, run", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "pigeons vocalize and birds chirp"], "sample_ids": ["uiS58TNyUiw", "uiS58TNyUiw"], "start_seconds": ["430", "430"], "properties": ["vocalize, bird, chirp", "vocalize, bird, chirp"], "captions_pred_video": ["of the pigeon in the cage", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["vZAw4apG0Es", "vKrYfzleLB8"], "start_seconds": ["30", "110"], "properties": ["people, clock, converse", "a, ring, gunshots"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a man speaks over a radio as wind blows 
and water splashes", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["tDVADusiIoc", "y8WEcpOlT3I"], "start_seconds": ["60", "40"], "properties": ["water, radio, man", "harsh, wind, blows"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity has a man speaking over a radio as wind blows and water splashes?", "label": 0}, {"captions": ["an small aircraft engine runs and a boy speaks", "some tunes played by whistling"], "sample_ids": ["xSKJGCItUWE", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["engine, run, boy", "tune, play, whistling"], "captions_pred_video": ["footage of the helicopter flying in the room", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["scraping and female speech with distant music", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["yHeVV-xeOxQ", "ziUT9IFTkjg"], "start_seconds": ["130", "10"], "properties": ["female, speech, music", "background, birds, rustling"], "captions_pred_video": ["of a girl milking a goat's udder", null], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "some men converse over an engine running"], "sample_ids": ["uWPRNLnpy7Y", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["accelerate, laugh, vehicle", "men, converse, engine"], "captions_pred_video": 
["is taken from a car driving down the street", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a dog barks and whimpers", "a clock ticktocks"], "sample_ids": ["sShpyu2l4YQ", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "ticktocks, clock, ticktocks"], "captions_pred_video": ["the puppies are playing with a toy", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a dog is barking and growling", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as horns blow", "children cheer as a man speaks then an audience screams"], "sample_ids": ["tHyNqRyK34A", "vJvryTwuAV8"], "start_seconds": ["24", "16"], "properties": ["a, man, speaks", "audience, cheer, man"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking and a crowd is shouting and whooping "], "question": "which man is speaking", "label": 1}, {"captions": ["an adult woman and an adult man speak", "an airplane accelerates briefly"], "sample_ids": ["zTLVJCo4WEE", "zjTG0gaGCUI"], "start_seconds": ["30", "80"], "properties": ["two people, adult, speak", "accelerates, airplane, briefly"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a jet engine roars as wind blows "], "question": "which is not a person", "label": 1}, {"captions": ["an audience gives applause", "wind blows as people chatter quietly"], "sample_ids": ["x6iCUDmRpKQ", "xBxDz0CFVn0"], "start_seconds": ["38", "30"], "properties": ["applause, audience, give", "wind, 
chatter, people"], "captions_pred_video": ["a black background with the moon and stars in the sky", "footage is blurry and out of focus"], "captions_pred_audio": ["a group of people are clapping and cheering", "a man is speaking with wind noise in the background "], "question": "which is quieter", "label": 0}, {"captions": ["females talk and laugh over gusting wind", "a motor idles, accelerates, then slows down."], "sample_ids": ["un9VQlzgZM", "vYkA3cfXp5Q"], "start_seconds": ["5", "30"], "properties": ["females, talk, laugh", "speed, idle, accelerate"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "an engine is idling"], "question": "which entity is not a person", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a telephone rings followed by a woman talking"], "sample_ids": ["sQGXqGcwOTc", "tGcFnX0GHI"], "start_seconds": ["3", "0"], "properties": ["audio, kid, giggles", "ring, talk, woman"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["children speak and play together", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yVVP8XvWJTo", "xBxDz0CFVn0"], "start_seconds": ["260", "30"], "properties": ["children, speak, play", "stream, water, flow"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage is blurry and out of focus"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes 
and water drains unevenly", "vehicles pass by on a roadway"], "sample_ids": ["vhJWZheqaE", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["water drains unevenly, toilet flushes, water drains", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a toilet is flushed", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["w8uLijTqtlU", "sapQIQUhFc"], "start_seconds": ["70", "280"], "properties": ["wind, microphone, noise", "liquid, flow, distance"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "an infant crying frantically"], "sample_ids": ["u21-Z5gJCB8", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "cry, infant, frantically"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a helicopter engine idles continuously", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["ugHJF0hfYkg", "sLUnaPT5gM8"], "start_seconds": ["10", "0"], "properties": ["engine, idle, continuously", "loud, laughter, intermittent"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a helicopter is flying overhead ", "a baby is 
laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a male speaks and another male speaks", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["viuTg1M-dqg", "s7knHCFW82w"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "blow horn, get close, train"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a train is blowing its horn and its wheels are squealing "], "question": "which is a train", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "several insects fly while two men talk"], "sample_ids": ["zcDwZ6W7E3E", "s-T9OVOiMLo"], "start_seconds": ["180", "330"], "properties": ["a, man, speak", "several, fly, men"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tIY7qOV3rEM", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "applause, audience, yells"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking and a crowd is clapping"], "question": "which 
entity is a performance", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["y8dSeubCNI", "tdWhHV3X25Q"], "start_seconds": ["4", "60"], "properties": ["men, women, car", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "birds coo incessantly"], "sample_ids": ["s4Uz1Ffgo04", "yZrFNS7GFBQ"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "coo, bird, incessant"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "of the bird in the cage"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "an owl hoots in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "someone whistles a tune"], "sample_ids": ["s59PfAghdkM", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["bird, chirp, background, horse, neigh", "someone, tune, whistle"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", null], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["material crumbles into a microphone", "a man speaks over intermittent keyboard taps"], "sample_ids": ["vofpvUo6NAw", "tw76HGONaKg"], "start_seconds": ["220", "570"], "properties": ["material, 
crumbles, microphone", "audio, man, keyboard"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a man speaks and types on a computer keyboard "], "question": "which entity is a video", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "water flows and trickles"], "sample_ids": ["wvKpEYswXO0", "tB7hWb9gTuQ"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "water, flow, trickle"], "captions_pred_video": ["of the person preparing food in the kitchen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "water is splashing and gurgling"], "question": "which entity has water flowing and trickling?", "label": 1}, {"captions": ["a door slams shut roughly", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zkKdxzNC97Y", "vfYTJq7nU"], "start_seconds": ["27", "130"], 
"properties": ["a door, slams, shut", "rustling, ducks, quack"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a duck quacks and a woman speaks"], "question": "which entity is more likely to be a door", "label": 0}, {"captions": ["a man speaks over intermittent keyboard taps", "pigeons vocalize and birds chirp"], "sample_ids": ["tw76HGONaKg", "uiS58TNyUiw"], "start_seconds": ["570", "430"], "properties": ["audio, man, keyboard", "vocalize, bird, chirp"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "of the pigeon in the cage"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["wind blows as people chatter quietly", "an insect buzzes around continuously"], "sample_ids": ["xBxDz0CFVn0", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["wind, chatter, people", "buzzes, continuously, insect"], "captions_pred_video": ["footage is blurry and out of focus", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a 
man is speaking with wind noise in the background ", "a fly is buzzing around a microphone "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream of water flows as people talk and wind blows", "winds blows roughly as a vehicle races past"], "sample_ids": ["xBxDz0CFVn0", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["stream, water, flow", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a stream", "label": 0}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "a duck quacks continuously"], "sample_ids": ["w9lpbUn0hPc", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["male, wind, rustling", "quacks, continuously, duck"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a propeller rotates loudly and intensely"], "sample_ids": ["tQWGZLItBXk", "ugHJF0hfYkg"], "start_seconds": ["170", "10"], "properties": ["music, person, ding", "loud, intense, propeller"], "captions_pred_video": ["worms revolution screenshots", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a helicopter engine runs", "a man speaks as a motor runs in 
the background"], "sample_ids": ["t5ZbXbniOWk", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["engine, helicopter, run", "background, motor, run"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "roadway noise occurs and a truck accelerates"], "sample_ids": ["yajyRTUQk3U", "tgbONvsP47Y"], "start_seconds": ["400", "0"], "properties": ["noise, woman, speak", "noise, truck, accelerate"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a car is driving on the road "], "question": "which noise is caused by a truck", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["s4tUs779vBA", "tw76HGONaKg"], "start_seconds": ["160", "570"], "properties": ["a, sound, stop", "A, game, keyboard"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of 
zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a man speaks and types on a computer keyboard "], "question": "which entity has a keyboard?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "wind blowing followed by a zoom"], "sample_ids": ["vzxHnu-SFEw", "vr8ZXjEBhMQ"], "start_seconds": ["80", "150"], "properties": ["two objects, woman, speak", "wind, blow, zoom"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a propeller rotates loudly and intensely"], "sample_ids": ["wnpJndXuxLc", "ugHJF0hfYkg"], "start_seconds": ["50", 
"10"], "properties": ["beeps, loud, whistle", "loud, intense, propeller"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["an engine runs and a man speaks", "a weapon fires multiple times"], "sample_ids": ["yT5WfYMRr-U", "sMC07Ucy7kg"], "start_seconds": ["30", "10"], "properties": ["engine, run, man", "weapon, fire, multiple"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage is from a car's point of view"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["leaves rustle while man speaks", "a vehicle accelerates and squeals tires"], "sample_ids": ["zOZleIRqZm4", "yRx9txMcBl0"], "start_seconds": ["80", "40"], "properties": ["leaves, rustle, speak", "accelerates, tires, squeals"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a car is revving its engine and skidding "], "question": 
"which entity is a vehicle", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a car speeding up in the distance"], "sample_ids": ["wtDqrBygTcU", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["man, engine, run", "distance, car, speed"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", null], "captions_pred_audio": ["a man is speaking and a motor is running", "a race car accelerates and revs its engine "], "question": "which object is moving faster", "label": 0}, {"captions": ["a man speaks as a machine runs", "some men converse over an engine running"], "sample_ids": ["vD6lYD1l0BY", "sCiy7QS1U"], "start_seconds": ["330", "300"], "properties": ["a, machine, run", "men, converse, engine"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a man speaking as a machine runs?", "label": 0}, {"captions": ["birds chirp quietly and an adult man speaks", "a machine clanks and thumps and a male speaks"], "sample_ids": ["zuua6-5goWw", "sWZzXuWYY"], "start_seconds": ["30", "420"], "properties": ["birds, chirp, quiet, man, speaks", "male, clanks, thumps"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": 
["birds are chirping and a man is speaking with background noise ", "a sewing machine runs and a man speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "people applaud and hoot and chat quietly"], "sample_ids": ["xKB8O8LTs6s", "wwyfGO2J4"], "start_seconds": ["70", "90"], "properties": ["music, gunshots, explosion", "people, applaud, hoot"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "people are clapping and speaking with background noise "], "question": "which entity is more peaceful", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "water flows and trickles"], "sample_ids": ["ugHJF0hfYkg", "tB7hWb9gTuQ"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", "water, flow, trickle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a helicopter is flying overhead ", "water is splashing and gurgling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["male speech with light ticking", "an airplane engine spools and people speak"], "sample_ids": ["xO-Q2BlIIPU", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["male, speech, ticking", "airplane, engine, spool"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine is running and people are talking"], "question": "which entity is a video", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a man speaks over intermittent keyboard taps"], 
"sample_ids": ["sWZzXuWYY", "tw76HGONaKg"], "start_seconds": ["420", "570"], "properties": ["male, clanks, thumps", "audio, man, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uzQnlJXBbOM", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["ringing, beep, stop", "loud, multiple, distance"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["someone is snoring while sleeping", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["ujMt0-D-x2k", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["snore, sleep, someone", "loud, laughter, intermittent"], 
"captions_pred_video": ["of the dog playing with a toy on the floor", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a person is snoring loudly", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tQWGZLItBXk", "uYT5gxnyMWM"], "start_seconds": ["170", "50"], "properties": ["music, person, ding", "female, spraying, scream"], "captions_pred_video": ["worms revolution screenshots", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a baby is crying"], "question": "which entity has a person speaking?", "label": 0}, {"captions": ["a machine engine runs and a man speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vs65y4qmyBE", "su6FAOcOA8c"], "start_seconds": ["340", "4"], "properties": ["engine, run, man", "engine, idle, woman"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity has a man speaking while an engine runs?", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x4a9YGIw4ok", "xKB8O8LTs6s"], "start_seconds": ["120", "70"], "properties": ["water, gurgles, stops", "music, gunfire, explosion"], "captions_pred_video": ["footage is blurry and out of focus", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a toilet flushes and 
water splashes", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "water is sprayed across a hard surface"], "sample_ids": ["xM4joTqDVp4", "sQwlkXjQabo"], "start_seconds": ["160", "10"], "properties": ["background, chirp, birds", "water, spray, surface"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["birds are chirping and a train is moving ", "spraying followed by silence"], "question": "which entity is a video of a train chugging?", "label": 0}, {"captions": ["a man speaking with light rustling", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zOZleIRqZm4", "wz7N8YRy74I"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "rooster, crow, background, men"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["x5cuQjOdM3E", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["cat, meows, young woman", "A, game, keyboard"], "captions_pred_video": ["a black background with an airplane flying in the sky", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina 
of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man speaks and types on a computer keyboard "], "question": "which entity is playing a video game", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "wind blows strongly and a young man speaks"], "sample_ids": ["xBxDz0CFVn0", "vs65y4qmyBE"], "start_seconds": ["30", "340"], "properties": ["stream, water, flow", "wind, blows, strongly"], "captions_pred_video": ["footage is blurry and out of focus", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a heavy engine is running and men are speaking "], "question": "which entity is a stream of water flowing as people talk and wind blows?", "label": 0}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "people speak as gunfire rings out"], "sample_ids": ["xyL9F5VrjkE", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["wind, motor, distance", "gunfire, ring, speak"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a large crowd cheers and 
applauds"], "sample_ids": ["y1saVTXsKwc", "rqfQRErjfk8"], "start_seconds": ["80", "170"], "properties": ["a, dog, talk", "crowd, cheers, applauds"], "captions_pred_video": ["a dog playing with a pink ball", "a man hugging another man in front of an orchestra"], "captions_pred_audio": ["a dog barks and a man speaks", "a crowd of people clapping and cheering"], "question": "which entity is more active", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vZAw4apG0Es", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["people, clock, converse", "loud, laughter, intermittent"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a clock is ticking and people are talking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["rustling with distant murmuring", "a car speeding up in the distance"], "sample_ids": ["wnNNcxAPwGQ", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["sound, distance, rustling", "distance, car, speed"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", null], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an engine runs loudly", "a clock ticktocks"], "sample_ids": ["vqZuVbG6-HI", "v-g-j2uTByM"], "start_seconds": ["130", "30"], "properties": ["loud, engine, run", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is blurry because it's raining outside", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a clock is ticking loudly"], 
"question": "which entity is quieter", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wTjoRj1se3U", "ukg5L09Wpvo"], "start_seconds": ["390", "150"], "properties": ["engine, run, people", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a jet engine is running and people are talking", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a car speeding up in the distance", "a clock ticktocks"], "sample_ids": ["u0TrcHhkPQ", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["distance, car, speed", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["paper is crumpling consistently", "a toilet flushes and a female speaks"], "sample_ids": ["v5cSxLaHADY", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "female, flushes, toilet"], "captions_pred_video": ["footage of the person holding a pair of scissors", "footage is blurry and out of focus"], "captions_pred_audio": ["paper is crumpled and crinkled", "a toilet flushes and a man speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a man speaks followed by another man speaking outside"], "sample_ids": ["tEE3MpBt1sg", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "two men, speak, follow"], "captions_pred_video": 
["footage is blurry due to the smoke in the air", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tQWGZLItBXk", "uZesmtKZGSw"], "start_seconds": ["170", "250"], "properties": ["music, person, ding", "men, talk, cars"], "captions_pred_video": ["worms revolution screenshots", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "winds blows roughly as a vehicle races past"], "sample_ids": ["sDSppXIlJrs", "xjvTpk2Zpr8"], "start_seconds": ["27", "70"], "properties": ["microphone, water, wind", "wind, blows, vehicle"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a jet engine roars and wind blows "], "question": "which entity is a 
vehicle?", "label": 1}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "people speak in the background as a clock ticktocks"], "sample_ids": ["vr8ZXjEBhMQ", "vZAw4apG0Es"], "start_seconds": ["150", "30"], "properties": ["sound, distance, engine", "background, clock, ticktocks"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a clock is ticking and people are talking"], "question": "which entity has a clock ticking in the background?", "label": 1}, {"captions": ["frogs croak and vocalize", "women speak and laugh as wind blows"], "sample_ids": ["yswmmRZFItk", "un9VQlzgZM"], "start_seconds": ["0", "5"], "properties": ["croak, vocalize, frog", "wind, speak, laugh"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is a human", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vBslzh7saPw", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["power, scream, increase", "three men, wind, flow"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "an airplane engine spools and people speak"], "sample_ids": ["wTideSjRFS0", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["food, sizzle, woman", "airplane, engine, spool"], "captions_pred_video": ["footage of 
a woman cooking in a kitchen with a microwave oven", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a toilet flushes and water drains", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sfAvvZwdLCY", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "multiple, people, yell"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a crowd of people are talking and laughing"], "question": "which entity has more water", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w5W5Kqtc8E", "wDVMhEdTiVw"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "water splashes as an animal walks through"], "sample_ids": ["vbpKkWvfOu4", "w1ir-sZ3Im8"], "start_seconds": ["560", "90"], "properties": ["a, man, speaks", "animal, water, splashes"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman 
is speaking and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["speaking following by laughing and clapping", "small dogs yip and bark sharply"], "sample_ids": ["u2f5NpsoHBg", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["person, laugh, clap", "bark, yip, sharply"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a weapon fires multiple times", "a machine beeps continuously"], "sample_ids": ["sMC07Ucy7kg", "y682ml90jGw"], "start_seconds": ["10", "11"], "properties": ["weapon, fire, multiple", "beeps, machine, continuously"], "captions_pred_video": ["footage is from a car's point of view", null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a beeping sound is being made "], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["vBHyYJ8pL0", "xyL9F5VrjkE"], "start_seconds": ["2", "20"], "properties": ["noise, door, opening", "wind, motor, distance"], "captions_pred_video": [null, "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "the wind is blowing and a car is passing by "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, {"captions": ["a propeller rotates loudly and intensely", "a man speaks as a motor runs in the background"], "sample_ids": ["ugHJF0hfYkg", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", 
"background, motor, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is quieter", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["weDbePuc-Xc", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["music, slaps, human", "two men, woman, birds"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "water pouring and bubbling"], "sample_ids": ["x9JovgqUcs", "uyRfq-jKPpo"], "start_seconds": ["500", "50"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro pu"], "captions_pred_audio": ["a man speaks and types on a keyboard", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a person snoring", "water splashes as an animal walks through"], "sample_ids": ["t8tv5YRMJUg", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["a person, snore, loud", "animal, water, splashes"], "captions_pred_video": ["of a man getting his face licked by another man", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a person sniffs and breathes heavily", "water splashes and gurgles as people speak"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wqN6IIHw3po", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "rooster, crow, background, men"], "captions_pred_video": ["in your own words what is happening in this screenshot? 
blood splattered all over the place", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "water is sprayed across a hard surface"], "sample_ids": ["uYT5gxnyMWM", "sQwlkXjQabo"], "start_seconds": ["50", "10"], "properties": ["female, spraying, scream", "water, spray, surface"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "spraying followed by silence"], "question": "which entity is sprayed across a hard surface", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tK4VlLsNxak", "wqZ135Ssz0"], "start_seconds": ["120", "60"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "two men, woman, birds"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a duck quacks continuously"], "sample_ids": ["wvKpEYswXO0", "vh30P49Po6s"], "start_seconds": ["150", "30"], "properties": ["water, tap, run", "quacks, continuously, duck"], "captions_pred_video": ["of the person preparing food in the kitchen", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water 
running ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "people speak as gunfire rings out"], "sample_ids": ["zFjIWfSD-4", "wqTCwqVRDlk"], "start_seconds": ["410", "80"], "properties": ["People, motor, brakes", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["food is frying and sizzles", "a car accelerates and wind blows"], "sample_ids": ["zNRChLjqcU", "u0TrcHhkPQ"], "start_seconds": ["220", "20"], "properties": ["food is frying, sizzles, food", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sNB8zxXneIM", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["several, quack, cocks", "multiple, people, yell"], "captions_pred_video": ["a group of geese in a cage", null], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a drill runs and two people laugh", "water flows and trickles"], "sample_ids": ["tEE3MpBt1sg", "tB7hWb9gTuQ"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "water, flow, trickle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["people are laughing 
breathing and speaking with background noise ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tDlysoZiA1I", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, chirp", "gun, shoot, water"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks as a motor runs in the background"], "sample_ids": ["sfAvvZwdLCY", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "background, motor, run"], "captions_pred_video": ["footage of the toilet in the bathroom", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a toilet is flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["someone snores nearby", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["spJCm8tD9Zo", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["someone snores, nearby, someone", "two men, woman, birds"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a man woman speak while crickets sing", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["zTLVJCo4WEE", "w34HjHr6gAY"], "start_seconds": ["30", 
"30"], "properties": ["a, crickets, sing", "beeps, hit, woman"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a child speaks in closed space"], "sample_ids": ["wP8ZKrlx3oA", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["fall, rain, splash", "child, space, speak"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not a splash", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wz7N8YRy74I", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "a woman, a television program, a bird"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a woman is speaking and 
a dog is whimpering"], "question": "which entity has a rooster?", "label": 0}, {"captions": ["a clock ticktocks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["v-g-j2uTByM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks", "stream, water, flow"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["frogs croak and vocalize", "a man speaks as a car is passing by"], "sample_ids": ["yswmmRZFItk", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["croak, vocalize, frog", "a, car, pass"], "captions_pred_video": ["a close up of a frog in the water", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a frog is croaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["a train horn blows as it passes by", "a man speaks as a car is passing by"], "sample_ids": ["zVacuqSb4LI", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["horn, blows, train", "a, car, pass"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video 
by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a man is speaking with background noise and breathing sounds "], "question": "which is a moving object", "label": 0}, {"captions": ["a vehicle engine accelerates and wind blows", "an infant crying frantically"], "sample_ids": ["wudZTNBtVqc", "zwOBqeFTgiU"], "start_seconds": ["60", "30"], "properties": ["accelerates, engine, wind", "cry, infant, frantically"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "of the baby crying in the car seat"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["goats bleat and people speak", "a frog croaks as other frogs croak in the background"], "sample_ids": ["z5iUE5h0EPs", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["goats bleat, people speak, language", "background, frog, croak"], "captions_pred_video": ["of the goat in the barn", "a close up of a frog in the water"], "captions_pred_audio": ["a goat bleats and a man 
speaks", "a frog is croaking"], "question": "which entity is a single animal", "label": 1}, {"captions": ["a helicopter engine runs continuously", "small dogs growl, bark and yip."], "sample_ids": ["ugHJF0hfYkg", "sShpyu2l4YQ"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "growl, bark, yip"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "the puppies are playing with a toy"], "captions_pred_audio": ["a helicopter is flying overhead ", "a dog is barking and growling"], "question": "which entity is more likely to be running continuously", "label": 0}, {"captions": ["a weapon fires multiple times", "a toilet flushes and a female speaks"], "sample_ids": ["sMC07Ucy7kg", "yaln9y8I7ms"], "start_seconds": ["10", "230"], "properties": ["weapon, fire, multiple", "female, flushes, toilet"], "captions_pred_video": ["footage is from a car's point of view", "footage is blurry and out of focus"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a toilet flushes and a man speaks"], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "an infant crying as a woman laughs"], "sample_ids": ["vzxHnu-SFEw", "xhmRY9yhC7c"], "start_seconds": ["80", "20"], "properties": ["two objects, woman, speak", "a, laugh, infant"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a baby cries and a woman speaks"], "question": "which woman is a mother", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vZAw4apG0Es", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["background, clock, ticktocks", "loud, multiple, distance"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a man speaks over a running engine and blowing wind"], "sample_ids": ["wtDqrBygTcU", "ylpYOorfH4o"], "start_seconds": ["30", "410"], "properties": ["man, engine, run", "engine, running, wind"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 
dodge ram 15"], "captions_pred_audio": ["a man is speaking and a motor is running", "a man is speaking and an engine is revving"], "question": "which entity is a boat?", "label": 0}, {"captions": ["an adult woman and an adult man speak", "a man speaks as a motor runs in the background"], "sample_ids": ["zTLVJCo4WEE", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["two people, adult, speak", "background, motor, run"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "birds chirp as a man speaks and a younger person speaks"], "sample_ids": ["smDKStoHBJo", "xl2PIWyXaM"], "start_seconds": ["0", "160"], "properties": ["a, talk, baby, cry", "chirp, man, younger person"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "birds are chirping and people are talking"], "question": "which entity has a baby?", "label": 0}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vddP56-ogds", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["water, splash, person, laugh", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has a person speaking?", "label": 0}, {"captions": ["food is frying while a woman speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["yhQ2Lg-7qDY", 
"uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["food, woman, speak", "animal, grunts, snorts"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a man speaks as bees buzz and birds chirp", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["t25U-v4k4ts", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["bees buzz, birds chirp, man speaks", "a woman, laughs, animal"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal snorting?", "label": 1}, {"captions": ["a small engine spits as it runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sZvwOuuPGP0", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["spits, engine, runs", "female, spraying, scream"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a medium engine is running ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a car speeding up in the distance"], "sample_ids": ["y682ml90jGw", "u0TrcHhkPQ"], "start_seconds": ["11", "20"], "properties": ["beeps, series, electronic", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a race car accelerates and revs its engine 
"], "question": "which is moving faster", "label": 0}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a stream of water runs briefly"], "sample_ids": ["vmrxwuAMb2I", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["a dog, inhales, exhales", "stream, water, run"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a dog barks and growls", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tMbMDvT50j8", "zl9Dqx-j7q4"], "start_seconds": ["12", "6"], "properties": ["a, cry, woman", "engine, laugh, loud"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a baby cries and a woman speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "water splashes as an animal walks through"], "sample_ids": ["s4Uz1Ffgo04", "w1ir-sZ3Im8"], "start_seconds": ["100", "90"], "properties": ["roars, background, people speaking", "animal, water, splashes"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak in a closed space", "a stream of water runs briefly"], "sample_ids": ["sTpirNYo8vQ", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["people, space, speak", "stream, water, run"], "captions_pred_video": ["of 
a man taking a selfie on a bus", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a telephone rings followed by a woman talking"], "sample_ids": ["tGcFnX0GHI", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["ring, talk, woman", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a video of a woman talking?", "label": 0}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "wind blows as people chatter quietly"], "sample_ids": ["xyL9F5VrjkE", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["wind, motor, distance", "wind, chatter, people"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage is blurry and out of focus"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a stream of water runs briefly"], "sample_ids": ["uqFtmnhuqA8", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["a, b, c", "stream, water, run"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "water splashing and a person laughs in 
the distance then a man speaks nearby"], "sample_ids": ["vKrYfzleLB8", "vddP56-ogds"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "water, splash, person, laugh"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", null], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "water is running and gurgling and a man is speaking"], "question": "which entity is more calm", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a clock ticktocks"], "sample_ids": ["vdoxuJn9lTc", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a child speaks followed by a burp", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "water splashes as an animal walks through"], "sample_ids": ["smDKStoHBJo", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["a, infant, speaking", "animal, water, splashes"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "water splashes and gurgles as people speak"], "question": "which entity is a video of an animal walking through water?", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a horn rings out as a machine runs by"], "sample_ids": ["tZGN5a7ybxo", "slZLHwNbbt4"], "start_seconds": ["60", "300"], "properties": ["ring, train, horn", "a, horn, run"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "footage of a train coming down the tracks on a sunny 
day"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a door opens and birds chirp"], "sample_ids": ["vfYTJq7nU", "yeFvk9x0wWI"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "door, open, birds"], "captions_pred_video": [null, "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a duck quacks and a woman speaks", "birds chirp in the background as a car drives by "], "question": "which entity is more quiet", "label": 1}, {"captions": ["children speak and play together", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yVVP8XvWJTo", "su6FAOcOA8c"], "start_seconds": ["260", "4"], "properties": ["children, speak, play", "engine, idle, woman"], "captions_pred_video": ["footage of a playground at a school or daycare center", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wvKpEYswXO0", "uEU-Hg5MTN8"], "start_seconds": ["150", "27"], "properties": ["water, tap, run", "a woman, laughs, animal"], "captions_pred_video": ["of the person preparing food in the kitchen", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking and a baby is crying"], "question": "which entity is about water?", "label": 0}, {"captions": 
["continuous sizzling with a woman speaking towards the end", "a toilet flushes and a female speaks"], "sample_ids": ["ukxt9I7eMMg", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["continuous, woman, speaking", "female, flushes, toilet"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a toilet flushes and a man speaks"], "question": "which entity has a woman speaking towards the end?", "label": 0}, {"captions": ["a beep occurs briefly", "a woman speaks as she rubs two objects together"], "sample_ids": ["xtWeJ56-U-g", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["beep, occur, briefly", "two objects, woman, speak"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "small dogs yip and bark sharply"], "sample_ids": ["se87d6yxEOA", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["run, whistle, pass", "bark, yip, sharply"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["xl2PIWyXaM", "uEU-Hg5MTN8"], "start_seconds": ["160", "27"], "properties": ["chirp, man, younger person", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds are chirping and people are talking", "a woman is speaking and a baby is crying"], "question": "which entity has a more snorts", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tIY7qOV3rEM", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "female, spraying, scream"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, 
{"captions": ["someone is typing on a computer keyboard", "a toilet flushes and water drains"], "sample_ids": ["v0x1odnXtP0", "sfAvvZwdLCY"], "start_seconds": ["210", "20"], "properties": ["keyboard, type, computer", "water drains, flushes, water"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a person is typing on a keyboard", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a dark barks and whimpers", "wind blows as people chatter quietly"], "sample_ids": ["sYj4hpDUZDQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "wind, chatter, people"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vf9xf3vMsGM", "sLUnaPT5gM8"], "start_seconds": ["540", "0"], "properties": ["A man speaks while turning a water faucet on.", "loud, laughter, intermittent"], "captions_pred_video": ["of the person washing their hands under the faucet", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds chirp as a train approaches", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xM4joTqDVp4", "vfYTJq7nU"], "start_seconds": ["160", "130"], "properties": ["bird, chirp, train", "rustling, ducks, quack"], 
"captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "a duck quacks and a woman speaks"], "question": "which entity is about birds?", "label": 0}, {"captions": ["water flows as men speak and yell", "a stream of water runs briefly"], "sample_ids": ["vJ7JPEFhyLA", "x-PeY8Yb8M4"], "start_seconds": ["16", "300"], "properties": ["water, flow, men", "stream, water, run"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["uEU-Hg5MTN8", "sLUnaPT5gM8"], "start_seconds": ["27", "0"], "properties": ["animal, grunts, snorts", "loud, laughter, intermittent"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["x4a9YGIw4ok", "vfYTJq7nU"], "start_seconds": ["120", "130"], "properties": ["water, gurgles, stops", "rustling, ducks, quack"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and water splashes", "a duck quacks and a woman speaks"], "question": "which entity is 
about water?", "label": 0}, {"captions": ["a vehicle engine revs as the vehicle passes", "speaking following by laughing and clapping"], "sample_ids": ["yDoT73BWsdA", "u2f5NpsoHBg"], "start_seconds": ["10", "30"], "properties": ["engine, revs, vehicle", "person, laugh, clap"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a crowd is clapping"], "question": "which entity is a person", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["sSMl2vc3ek", "sjlVMgdGSK0"], "start_seconds": ["20", "30"], "properties": ["a person, laughs, snores", "accelerates, vehicle, race car"], "captions_pred_video": [null, "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a person snoring loudly", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person is burping while a girl speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vdoxuJn9lTc", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["person, burp, girl", "rustling, ducks, quack"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a woman speaks as she rubs two objects together"], "sample_ids": ["zFjIWfSD-4", "vzxHnu-SFEw"], "start_seconds": ["410", "80"], "properties": ["People, motor, brakes", "two objects, woman, speak"], 
"captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a woman speaking?", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "water splashes as an animal walks through"], "sample_ids": ["zALy31PjDl0", "w1ir-sZ3Im8"], "start_seconds": ["21", "90"], "properties": ["a man, a vehicle, a horn", "animal, water, splashes"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["bees buzz as wind blows", "a small engine idles continuously"], "sample_ids": ["tMJne1a4AFI", "y5WII6cTH7k"], "start_seconds": ["0", "40"], "properties": ["bees, buzz, wind", "engine, idle, continuously"], "captions_pred_video": ["a swarm of bees 
on the ground", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a swarm of bees buzzing around", "an engine is knocking and vibrating "], "question": "which entity is not a living thing", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "paper folding and crinkling"], "sample_ids": ["sjlVMgdGSK0", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["accelerates, vehicle, race car", "paper, fold, crinkle"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which is not a vehicle", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["uZesmtKZGSw", "vlS6YMeWAPo"], "start_seconds": ["250", "40"], "properties": ["men, talk, cars", "sheep, baa, birds"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["vlJS7LN2XyM", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["background, clocks, ticking", "wind, blows, vehicle"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a ticktock of a clock", "a jet engine roars and wind blows "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a infant makes noise and is excited"], "sample_ids": ["t97k0cejSQE", "wIJK3-5y0kA"], "start_seconds": ["250", "30"], "properties": ["bird, chirp, insect", "noise, excited, infant"], "captions_pred_video": ["a bee on a purple thistle flower", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a toilet flushes and water drains", "a man speaks in the background while a slow tick repeats"], "sample_ids": ["sfAvvZwdLCY", "vZAw4apG0Es"], 
"start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "background, tick, repeat"], "captions_pred_video": ["footage of the toilet in the bathroom", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a toilet is flushed", "a clock is ticking and people are talking"], "question": "which entity is a video of a toilet flushing?", "label": 0}, {"captions": ["pigeons vocalize and birds chirp", "a stream of water runs briefly"], "sample_ids": ["uiS58TNyUiw", "x-PeY8Yb8M4"], "start_seconds": ["430", "300"], "properties": ["vocalize, bird, chirp", "stream, water, run"], "captions_pred_video": ["of the pigeon in the cage", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "an insect buzzes around continuously"], "sample_ids": ["ugHJF0hfYkg", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["loud, intense, propeller", "buzzes, continuously, insect"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a helicopter is flying overhead ", "a fly is buzzing around a microphone "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a toilet flushes and a female speaks"], "sample_ids": ["soTOh3zYJfY", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": ["vehicle, skid, tires", "female, flushes, toilet"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "footage is blurry and out of focus"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": 
["a person is snoring while sleeping", "dishes cling together then a man begins to speak"], "sample_ids": ["vJrjSeP17yE", "sQGXqGcwOTc"], "start_seconds": ["40", "3"], "properties": ["a person is sleeping, snoring, person", "cling, speak, dishes"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person snoring loudly", "mechanisms are operating and water is splashing "], "question": "which entity is about a person speaking?", "label": 1}, {"captions": ["paper is crumpling consistently", "a saw finishes running as metal clings in the background"], "sample_ids": ["v5cSxLaHADY", "zofjfKhqLk8"], "start_seconds": ["0", "10"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "background, metal, clings"], "captions_pred_video": ["footage of the person holding a pair of scissors", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["paper is crumpled and crinkled", "a large engine is running and a bell is ringing"], "question": "which entity is a video of a saw running?", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "paper folding and crinkling"], "sample_ids": ["y8WEcpOlT3I", "zPpG3RD8lSs"], "start_seconds": ["40", "20"], "properties": ["wind, speak, buffeting", "paper, fold, crinkle"], "captions_pred_video": ["on how to use a sewing machine youtube", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how 
to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["food is frying and sizzles", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zNRChLjqcU", "tdWhHV3X25Q"], "start_seconds": ["220", "60"], "properties": ["food is frying, sizzles, food", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vbr9mHKc8WM", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["noise, loudness, engine", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a baby is crying"], "question": "which entity is quieter", "label": 0}, {"captions": ["race cars go around a track as a man commentates", "winds blows roughly as a vehicle races past"], "sample_ids": ["uZesmtKZGSw", "xjvTpk2Zpr8"], "start_seconds": ["250", "70"], "properties": ["car, track, man", "wind, blows, vehicle"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "some tunes played by whistling"], "sample_ids": ["w5W5Kqtc8E", "u6BnG6YZqJ4"], "start_seconds": ["100", "0"], "properties": ["wind, engine, scream", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a motorcycle engine works nearby", "winds blows roughly as a vehicle races past"], "sample_ids": ["tOSWIURC-4", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["engine, work, nearby", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a lawn mower is running ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man speaks while water drains", "an insect buzzes around continuously"], "sample_ids": ["vSeGhaZt-aI", "v25l1jef3JY"], "start_seconds": ["50", "0"], "properties": ["water, drain, man", "buzzes, continuously, insect"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man 
is speaking and pouring liquid with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tZGN5a7ybxo", "tiDFTC-5vU"], "start_seconds": ["60", "30"], "properties": ["ring, train, horn", "male, duck, laugh"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", null], "captions_pred_audio": ["a train is moving and blowing its horn ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a clock ticktocks continuously", "a man sprays as a scraping occurs in the background"], "sample_ids": ["vlJS7LN2XyM", "sOa7g-44Dag"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "background, man, spray"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and rubbing his hands together "], "question": "which entity is a man?", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a man makes an exclamation, then another man speaks"], "sample_ids": ["w2M4i1mklOA", "xO-Q2BlIIPU"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "two men, exclamation, speak"], "captions_pred_video": ["footage of an antique clock", "a clock with a green glowing display showing the time 09 07 2016 12 31 2016"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking with background noise and breathing sounds "], "question": "which is quieter", "label": 1}, {"captions": ["a motor noise is accompanied by a door 
opening and closing", "water splashes and a door squeaks"], "sample_ids": ["vBHyYJ8pL0", "sdXV-ylviw"], "start_seconds": ["2", "190"], "properties": ["noise, door, opening", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a dog barks and taps with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["a person is burping while a girl speaks", "vehicle tires screech and a man speaks before a car door opens"], "sample_ids": ["vdoxuJn9lTc", "sxYkFKFIZD0"], "start_seconds": ["40", "20"], "properties": ["person, burp, girl", "screech, man, door"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking while a car is revving and accelerating with a squeal in the background "], "question": "which entity is about a car door opening?", "label": 1}, {"captions": ["ticking continues without interruption", "a male speaks and another male speaks"], "sample_ids": ["v-g-j2uTByM", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "two males, speaking, male"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is not continuous", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an 
explosion occurs", "water flows as men speak and yell"], "sample_ids": ["xKB8O8LTs6s", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["music, gunfire, explosion", "water, flow, men"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "paper is crumpling consistently"], "sample_ids": ["vbZ-0lGPneg", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["a woman, a television program, a bird", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "an insect buzzes around continuously"], "sample_ids": ["v7jJS8aAyA", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["wind, blows, loudly", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a fly is buzzing around a microphone "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a child speaks in closed space"], "sample_ids": ["yajyRTUQk3U", "yW6FWLSLkx4"], "start_seconds": ["400", "40"], "properties": ["noise, woman, speak", "child, space, speak"], "captions_pred_video": ["- a woman cooking in the kitchen", "of the doll sitting on the bed with her 
hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking over a noise", "label": 0}, {"captions": ["wind blows strongly", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w8uLijTqtlU", "wDVMhEdTiVw"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "gun, shoot, water"], "captions_pred_video": ["footage is blurry and shaky", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["the wind is blowing strongly", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a gun?", "label": 0}, {"captions": ["a person speaks over rustling leaves", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zOZleIRqZm4", "wDVMhEdTiVw"], "start_seconds": ["80", "30"], "properties": ["rustling, leaves, person", "gun, shoot, water"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a person sniffles and sneezes", "water flows and trickles"], "sample_ids": ["uRlbY6aoBU", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is sneezing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a toilet flushes and water drains", "a clock ticktocks"], "sample_ids": ["sfAvvZwdLCY", "v-g-j2uTByM"], "start_seconds": ["20", "30"], 
"properties": ["water drains, flushes, water", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a toilet is flushed", "a clock is ticking loudly"], "question": "which entity is a timepiece", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a child speaks in closed space"], "sample_ids": ["zALy31PjDl0", "yW6FWLSLkx4"], "start_seconds": ["21", "40"], "properties": ["a man, a vehicle, a horn", "child, space, speak"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["w2M4i1mklOA", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["alarm, gears, turn", "water, radio, man"], "captions_pred_video": ["footage of an antique clock", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a clock?", "label": 0}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["vlJS7LN2XyM", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["background, clocks, ticking", "engine revs, vehicle, people"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a school bus decorated with christmas 
lights is floating in the water"], "captions_pred_audio": ["a ticktock of a clock", "a truck is revving its engine and a man is speaking "], "question": "which entity is more active", "label": 1}, {"captions": ["ticking continues without interruption", "a duck quacks loudly and continuously"], "sample_ids": ["v-g-j2uTByM", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "loud, continuous, quacks"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a clock is ticking loudly", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tDVADusiIoc", "wDVMhEdTiVw"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "gun, shoot, water"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is about shooting water?", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a clock ticktocks"], "sample_ids": ["wtDqrBygTcU", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a motor is running", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "paper is crumpling consistently"], 
"sample_ids": ["voJh2gJxXhA", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["music, frog, croak", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["music is playing and crickets are chirping ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "plastic is tapped on while someone speaks"], "sample_ids": ["x4a9YGIw4ok", "wvKpEYswXO0"], "start_seconds": ["120", "150"], "properties": ["water, gurgles, stops", "plastic, tap, speak"], "captions_pred_video": ["footage is blurry and out of focus", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a toilet flushes and water splashes", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks?", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sEprKHm8Sj8", "xfaoyyzw2WU"], "start_seconds": ["90", "180"], "properties": ["noise, loud, buzzing", "loud, jet engine, roar"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "an aircraft engine roars and a man speaks "], "question": "which noise is louder", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "an airplane engine runs"], "sample_ids": ["sOa7g-44Dag", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": 
["background, man, spray", "engine, airplane, runs"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man speaks while water drains", "wind blows as people chatter quietly"], "sample_ids": ["vSeGhaZt-aI", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["water, drain, man", "wind, chatter, people"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "some tunes played by whistling"], "sample_ids": ["vW4x7S1VfQc", "u6BnG6YZqJ4"], "start_seconds": ["150", "0"], "properties": ["clacking, oil, woman", "tune, play, whistling"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["food sizzles in a frying pan", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xjhAnI2q6hM", "yajyRTUQk3U"], "start_seconds": ["6", "400"], "properties": ["engine revs, vehicle, people", "a woman, something, fried"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a truck is revving its engine and a man is 
speaking ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["y2bVZ7rz-5M", "vlS6YMeWAPo"], "start_seconds": ["280", "40"], "properties": ["motor noise, horn, siren", "sheep, baa, birds"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a goat bleats and birds chirp"], "question": "which entity is followed by a horn honking and a siren wailing", "label": 0}, {"captions": ["a person is snoring while sleeping", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vJrjSeP17yE", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["a person is sleeping, snoring, person", "animal, grunts, snorts"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["an airplane engine runs", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yVPZ2MNWpms", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["engine, airplane, runs", "People, motor, brakes"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "paper is crumpling consistently"], "sample_ids": 
["vdoxuJn9lTc", "v5cSxLaHADY"], "start_seconds": ["40", "0"], "properties": ["burp, loud, girl", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a child speaks followed by a burp", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["children speak and play together", "someone snores nearby"], "sample_ids": ["yVVP8XvWJTo", "spJCm8tD9Zo"], "start_seconds": ["260", "90"], "properties": ["children, speak, play", "someone snores, nearby, someone"], "captions_pred_video": ["footage of a playground at a school or daycare center", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a car accelerates and wind blows"], "sample_ids": ["yLy-WycbVVE", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, people, talk", "accelerates, wind, blows"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "two women and a man talk while a kid cries"], "sample_ids": ["yFB25fqfU8I", "wyllXV6PjKo"], "start_seconds": ["300", "30"], "properties": ["wave, crash, shoreline", "a kid, talk, cry"], "captions_pred_video": ["footage of a person surfing in the ocean", null], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a woman 
speaks and a baby cries"], "question": "which entity is more calm", "label": 1}, {"captions": ["a person screams glaringly", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["xC8kbrKJmco", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["glaringly, screams, person", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "a man is speaking and ducks are quacking"], "question": "which entity is a person?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a vehicle accelerates and squeals tires"], "sample_ids": ["uRExseg-0XI", "yRx9txMcBl0"], "start_seconds": ["210", "40"], "properties": ["woman, man, water", "accelerates, tires, squeals"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an adult woman and an adult man speak", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zTLVJCo4WEE", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["two people, adult, speak", "music, gunfire, explosion"], "captions_pred_video": ["- a boy with a 
rifle aiming at a target", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman speaks and crickets chirp", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["people speak and tapping occurs", "water running down a sink while a man is talking"], "sample_ids": ["tFCUUGdREgA", "vSeGhaZt-aI"], "start_seconds": ["70", "50"], "properties": ["people, tap, speak", "water, sink, talk"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of water running down a sink while a man is talking?", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xKB8O8LTs6s", "wDVMhEdTiVw"], "start_seconds": ["70", "30"], "properties": ["music, gunshots, explosion", "gun, shoot, water"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity has a gunshot followed by an explosion?", "label": 0}, {"captions": ["multiple people speak and children yell while water gurgles", "a infant makes noise and is excited"], "sample_ids": ["vb1fPSDI4c", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a baby cries and a woman speaks"], "question": 
"which entity is more quiet", "label": 1}, {"captions": ["a man speaks over intermittent keyboard taps", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tw76HGONaKg", "vb1fPSDI4c"], "start_seconds": ["570", "30"], "properties": ["audio, man, keyboard", "multiple, people, yell"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "people cheer as a vehicle engine revs"], "sample_ids": ["rwTERCUno", "xjhAnI2q6hM"], "start_seconds": ["90", "6"], "properties": ["engine, idle, sputter", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["an engine is idling and vibrating", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "water flows as men speak and 
yell"], "sample_ids": ["s4Uz1Ffgo04", "vJ7JPEFhyLA"], "start_seconds": ["100", "16"], "properties": ["water, rushes, motorcycle", "water, flow, men"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "water splashes as an animal walks through"], "sample_ids": ["tMJne1a4AFI", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["wind, buzz, rustling", "animal, water, splashes"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a swarm of bees buzzing around", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "water flows as men speak and yell"], "sample_ids": ["w34HjHr6gAY", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["beeps, hit, woman", "water, flow, men"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking while the wind is blowing and 
water is splashing"], "question": "which entity has more water flowing", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["u--KhUW8l1Y", "w6RTHR6AeAg"], "start_seconds": ["0", "40"], "properties": ["engine, sound, horn", "call, owl, screech"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", null], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "an owl hoots and mechanisms operate "], "question": "which entity is a bird?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["uRExseg-0XI", "xfaoyyzw2WU"], "start_seconds": ["210", "180"], "properties": ["woman, man, water", "loud, jet engine, roar"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vVhthZ45k3Y", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["cat, purr, hiss", "harsh, wind, blows"], "captions_pred_video": ["footage is blurry and out of focus", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be a natural phenomenon", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wudZTNBtVqc", 
"tDVADusiIoc"], "start_seconds": ["60", "60"], "properties": ["accelerates, engine, wind", "water, radio, man"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "wind blowing and birds chirping with the distant cooing of a large bird"], "sample_ids": ["w5W5Kqtc8E", "wRBHTgrbiwg"], "start_seconds": ["100", "50"], "properties": ["water, splashes, motorboat", "birds, chirp, cooing"], "captions_pred_video": [null, "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "birds are chirping and insects are buzzing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["wvKpEYswXO0", "viuTg1M-dqg"], "start_seconds": ["150", "30"], "properties": ["plastic, tap, speak", "two men, speak, follow"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["someone snores nearby", "a woman speaks as she rubs two objects together"], "sample_ids": ["spJCm8tD9Zo", "vzxHnu-SFEw"], "start_seconds": ["90", "80"], "properties": ["someone snores, nearby, someone", "two objects, woman, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["soTOh3zYJfY", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["vehicle, skid, tires", "a woman, a television program, a bird"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["food fries in a pan as someone talks and cooks", "a man speaks as a car is passing by"], "sample_ids": ["ukxt9I7eMMg", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "a, car, pass"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "for 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "someone whistles a tune"], "sample_ids": ["w34HjHr6gAY", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["beeps, hit, woman", "someone, tune, whistle"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "an engine runs loudly"], "sample_ids": ["vbpKkWvfOu4", "vqZuVbG6-HI"], "start_seconds": ["560", "130"], "properties": ["a, man, speaks", "loud, engine, run"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "someone is typing on a computer keyboard"], "sample_ids": ["su6FAOcOA8c", "v0x1odnXtP0"], "start_seconds": ["4", "210"], "properties": ["engine, idle, woman", "keyboard, type, computer"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a person is typing on a keyboard"], "question": "which is not a person", "label": 0}, {"captions": ["a speedboat passes quickly on the water", "dishes cling together then a man begins to speak"], "sample_ids": ["tjmoSi330GM", "sQGXqGcwOTc"], "start_seconds": ["23", "3"], "properties": ["speed, water, boat", "cling, speak, dishes"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "mechanisms are operating and water is splashing "], "question": "which entity is a still image?", "label": 0}, {"captions": ["a clock ticktocks continuously", "a woman speaks as she rubs two objects together"], "sample_ids": ["vlJS7LN2XyM", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["ticktocks, clock, ticktocks continuously", "two objects, woman, speak"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a clock", "label": 0}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "some light rustling followed by a loud burp and a girl speaking"], "sample_ids": ["s59PfAghdkM", "vdoxuJn9lTc"], "start_seconds": ["0", "40"], "properties": ["bird, chirp, background, horse, neigh", "burp, loud, girl"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a child speaks followed by a burp"], "question": "which entity has a horse in it?", "label": 0}, {"captions": ["birds chirp and pigeons vocalize while walking around", "running water in a faucet with some clinks"], "sample_ids": ["wIvYjuR3nrg", "zNRChLjqcU"], "start_seconds": ["9", "220"], "properties": ["birds, pigeons, vocalize", "water, faucet, run"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", null], "captions_pred_audio": ["birds are 
chirping and cooing", "water is running from a faucet into a sink"], "question": "which entity is silent", "label": 1}, {"captions": ["a person screams glaringly", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["xC8kbrKJmco", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["glaringly, screams, person", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a goat is bleating ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["wP8ZKrlx3oA", "uWPRNLnpy7Y"], "start_seconds": ["40", "10"], "properties": ["rain, storm, thunder", "accelerate, laugh, vehicle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "is taken from a car driving down the street"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a car accelerates and revs its engine "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "water rushes by"], "sample_ids": ["s4Uz1Ffgo04", "x-PeY8Yb8M4"], "start_seconds": ["100", "300"], "properties": ["water, rushes, vehicle", "water, rushes, by"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a car is driving on a wet road "], "question": "which entity is a video of water rushing by?", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["wqADXCzngMw", "uEU-Hg5MTN8"], 
"start_seconds": ["340", "27"], "properties": ["engine, idle, man", "animal, grunts, snorts"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "an airplane engine runs"], "sample_ids": ["yaln9y8I7ms", "yVPZ2MNWpms"], "start_seconds": ["230", "0"], "properties": ["female, flushes, toilet", "engine, airplane, runs"], "captions_pred_video": ["footage is blurry and out of focus", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["leaves rustle while man speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zOZleIRqZm4", "vb1fPSDI4c"], "start_seconds": ["80", "30"], "properties": ["leaves, rustle, speak", "multiple, people, yell"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a crowd of people are talking and laughing"], "question": "which entity is more active", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a clock ticktocks in wind"], "sample_ids": ["yLy-WycbVVE", "yVumC9TGknc"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "ticktocks, clock, wind"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "game title screen of the game shadow of the colossus on sony playstation 2"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a series of beeps and 
chirps"], "question": "which entity is silent", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "an airplane engine runs"], "sample_ids": ["zofjfKhqLk8", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["noise, stop, motor", "engine, airplane, runs"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zNRChLjqcU", "yajyRTUQk3U"], "start_seconds": ["220", "400"], "properties": ["water, faucet, run", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tjmoSi330GM", "vfYTJq7nU"], "start_seconds": ["23", "130"], "properties": ["speed, water, boat", "rustling, ducks, quack"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", null], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a duck quacks and a woman speaks"], "question": "which entity is moving at a slower speed", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a telephone rings followed by a woman talking"], "sample_ids": ["wRV8yMk886E", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["liquid, spray, nozzle", "ring, talk, woman"], 
"captions_pred_video": ["two cars are parked in a parking lot at night", null], "captions_pred_audio": ["a man speaks followed by a loud burst", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["some men converse over an engine running", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sCiy7QS1U", "uZesmtKZGSw"], "start_seconds": ["300", "250"], "properties": ["men, converse, engine", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity shows men talking about cars?", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vlJS7LN2XyM", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["background, clocks, ticking", "a, scream, girl"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": 
["a door slams shut roughly", "a woman speaks as she rubs two objects together"], "sample_ids": ["zkKdxzNC97Y", "vzxHnu-SFEw"], "start_seconds": ["27", "80"], "properties": ["a door, slams, shut", "two objects, woman, speak"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a door is opened and closed", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tDlysoZiA1I", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["animal, grunt, multiple", "music, gunfire, explosion"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", 
"label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uYT5gxnyMWM", "tiDFTC-5vU"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "male, duck, laugh"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a man speaks while water drains", "paper is crumpling consistently"], "sample_ids": ["vSeGhaZt-aI", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["water, drain, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man talks as several small engines run", "a clock ticktocks"], "sample_ids": ["u9A6VZQCZpU", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "people applaud and hoot and chat quietly"], "sample_ids": ["w0xsN8X18Y", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["rain, thunder, surface", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "people 
are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["white noise and birds chirping", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["wRBHTgrbiwg", "tDlysoZiA1I"], "start_seconds": ["50", "0"], "properties": ["noise, white, chirping", "animal, grunts, chirps"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "birds are chirping and a rooster is crowing "], "question": "which entity is more like a song", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sK4u5T8hW78", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["a, car, pass", "harsh, wind, blows"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be in a car", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "a toilet flushes and a female speaks"], "sample_ids": ["y8WEcpOlT3I", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": ["wind, speak, buffeting", "female, flushes, toilet"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage is blurry and out of focus"], 
"captions_pred_audio": ["a man is speaking with wind noise in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a man speaks while a machine runs before a smoke alarm beeps"], "sample_ids": ["tDVADusiIoc", "sG7TyPnFDR0"], "start_seconds": ["60", "180"], "properties": ["man, radio, blows", "beeps, machine, smoke alarm"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a person is using an espresso machine in a restaurant"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a microwave oven is beeping "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["someone is typing on a computer keyboard", "water flows and trickles"], "sample_ids": ["v0x1odnXtP0", "tB7hWb9gTuQ"], "start_seconds": ["210", "30"], "properties": ["keyboard, type, computer", "water, flow, trickle"], "captions_pred_video": ["how to make money on youtube in spanish", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a person is typing on a keyboard", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "wind blows as people chatter quietly"], "sample_ids": ["ukxt9I7eMMg", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "wind, chatter, people"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "a vehicle engine runs and 
wind blows before women yell"], "sample_ids": ["vZAw4apG0Es", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["people, clock, converse", "wind, blow, vehicle"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "waves crash against a shoreline and people speak"], "sample_ids": ["y8WEcpOlT3I", "yFB25fqfU8I"], "start_seconds": ["40", "300"], "properties": ["harsh, wind, blows", "wave, crash, shoreline"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman speaks with water running", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wTideSjRFS0", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["water, running, woman", "engine, accelerate, idle"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wvKpEYswXO0", "sSMl2vc3ek"], "start_seconds": ["150", "20"], "properties": ["water, tap, run", "loud, multiple, distance"], "captions_pred_video": ["of the person preparing food in the 
kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["w5W5Kqtc8E", "yDoT73BWsdA"], "start_seconds": ["100", "10"], "properties": ["wind, blow, vehicle", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "someone whistles a tune"], "sample_ids": ["yJ0TePmaOo", "sIXTftIuUgw"], "start_seconds": ["390", "90"], "properties": ["two hard objects, man, speak", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["uzQnlJXBbOM", "tDlfY3nmx1A"], "start_seconds": ["50", "160"], "properties": ["ringing, beep, stop", "applause, laugh, man"], "captions_pred_video": ["footage of a person using a cell phone on a table", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a telephone rings and a man speaks", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity is a performance", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "several insects fly while two men talk"], "sample_ids": ["vb1fPSDI4c", "s-T9OVOiMLo"], 
"start_seconds": ["30", "330"], "properties": ["multiple, people, yell", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more people", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["xKB8O8LTs6s", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["music, gunshots, explosion", "a, scream, girl"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "plastic is tapped on while someone speaks"], "sample_ids": ["sAam2NqGhLY", "wvKpEYswXO0"], "start_seconds": ["20", "150"], "properties": ["snoring, breathing, child", "plastic, tap, speak"], "captions_pred_video": ["of a little girl sleeping on a couch", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person is snoring", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is not a person", "label": 1}, {"captions": ["women speak and laugh as wind blows", "multiple people speak and children yell while water gurgles"], "sample_ids": ["un9VQlzgZM", "vb1fPSDI4c"], "start_seconds": ["5", "30"], "properties": ["wind, speak, laugh", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a crowd of people are talking and laughing"], 
"question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["xZepNM9qcRA", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["background, motor, run", "loud, laughter, intermittent"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a motorcycle engine is idling", "a infant makes noise and is excited"], "sample_ids": ["vZAqdHZ81yA", "wIJK3-5y0kA"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "noise, excited, infant"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["an engine is idling loudly", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["xSKJGCItUWE", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["engine, run, boy", "a, scream, girl"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], 
"sample_ids": ["yJ0TePmaOo", "vfYTJq7nU"], "start_seconds": ["390", "130"], "properties": ["two hard objects, man, speak", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a duck quacks and a woman speaks"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a jet engine roars, almost making a man inaudible", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["xfaoyyzw2WU", "vlS6YMeWAPo"], "start_seconds": ["180", "40"], "properties": ["loud, jet engine, roar", "sheep, baa, birds"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a goat bleats and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["rustling with distant murmuring", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wnNNcxAPwGQ", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["sound, distance, rustling", "a woman, something, fried"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "water flows and trickles"], "sample_ids": ["wvKpEYswXO0", "tB7hWb9gTuQ"], "start_seconds": ["150", "30"], "properties": ["plastic, tap, speak", "water, flow, trickle"], "captions_pred_video": ["of the person preparing food in the kitchen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and tapping with 
background noise and water running ", "water is splashing and gurgling"], "question": "which entity is flowing", "label": 1}, {"captions": ["goats bleat and metal clings", "a man speaks as a car is passing by"], "sample_ids": ["tH17JPjDPnc", "sK4u5T8hW78"], "start_seconds": ["260", "30"], "properties": ["bleat, metal, clings", "a, car, pass"], "captions_pred_video": ["feed of the goats eating hay in the barn", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "people cheer as a vehicle engine revs"], "sample_ids": ["xjvTpk2Zpr8", "xjhAnI2q6hM"], "start_seconds": ["70", "6"], "properties": ["engine, run, wind", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a truck is revving its engine and a man is speaking "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "an infant crying frantically"], "sample_ids": ["y2ZBGpgbhHM", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["dog, chirp, breathe", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["birds chirping and a dog panting", "a baby cries loudly"], 
"question": "which entity is crying", "label": 1}, {"captions": ["a toilet flushes and water drains", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sfAvvZwdLCY", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["water drains, flushes, water", "loud, multiple, distance"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["zuua6-5goWw", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["birds, chirp, quiet, man, speaks", "sheep, baa, birds"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a goat bleats and birds chirp"], "question": "which entity is more active", "label": 1}, {"captions": ["a helicopter engine runs", "a man speaks followed by another man speaking outside"], "sample_ids": ["t5ZbXbniOWk", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["engine, helicopter, run", "two men, speak, follow"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on 
nintendo gamecube", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tw76HGONaKg", "wz7N8YRy74I"], "start_seconds": ["570", "30"], "properties": ["A, game, keyboard", "rooster, crow, background, men"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["vehicle engines race around a track as a man commentates", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sZPuqDgX2V0", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["commentator, race, track", "music, gunfire, explosion"], 
"captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["an infant crying frantically", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zwOBqeFTgiU", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["cry, infant, frantically", "a woman, something, fried"], "captions_pred_video": ["of the baby crying in the car seat", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a baby cries loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "people applaud and hoot and chat quietly"], "sample_ids": ["yZrFNS7GFBQ", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["pigeon, buzzes, insect", "people, applaud, hoot"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["material crumbles into a microphone", "paper is crumpling consistently"], "sample_ids": ["vofpvUo6NAw", "v5cSxLaHADY"], "start_seconds": ["220", "0"], "properties": ["material, crumbles, microphone", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["paper is being crumpled and crinkled", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a grown man speaks and water bubbles and runs"], "sample_ids": 
["yaln9y8I7ms", "vSeGhaZt-aI"], "start_seconds": ["230", "50"], "properties": ["female, flushes, toilet", "water, bubbles, run"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking and pouring liquid with background noise "], "question": "which entity has a female speaking?", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "a diesel truck engine runs continuously"], "sample_ids": ["wyllXV6PjKo", "sZvwOuuPGP0"], "start_seconds": ["30", "50"], "properties": ["a kid, talk, cry", "engine, diesel, truck"], "captions_pred_video": [null, "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a woman speaks and a baby cries", "a medium engine is running "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["xERFUeZONz8", "s7knHCFW82w"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "blow horn, get close, train"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["an emergency vehicle siren blares", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a door opens and birds chirp", "water splashes and a door squeaks"], "sample_ids": ["yeFvk9x0wWI", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["door, open, birds", "sound, splash, door"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a dog barks and taps 
with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["a clock ticktocks continuously", "water splashes as an animal walks through"], "sample_ids": ["vlJS7LN2XyM", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["ticktocks, clock, ticktocks continuously", "animal, water, splashes"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a ticktock of a clock", "water splashes and gurgles as people speak"], "question": "which entity is a moving object", "label": 1}, {"captions": ["some clanking with distant murmuring", "small dogs yip and bark sharply"], "sample_ids": ["uMTTDZ2mb4", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["clanking, murmuring, distant", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a stream of water runs briefly"], "sample_ids": ["u7C-AEBQM", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["ticks, rhythmic, quiet", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a ticktock of a clock", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "plastic is tapped on while someone speaks"], "sample_ids": ["xZepNM9qcRA", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["background, motor, run", "plastic, tap, speak"], "captions_pred_video": ["a 
close-up view of the motorcycle's engine and exhaust system", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity has a motor running in the background", "label": 0}, {"captions": ["a man speaks uses a drill", "a male speaks and another male speaks"], "sample_ids": ["x5eIC7S0fbg", "viuTg1M-dqg"], "start_seconds": ["60", "30"], "properties": ["A man is speaking, uses a drill, and is a tool", "two males, speaking, male"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more males speaking", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xBxDz0CFVn0", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["stream, water, flow", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and out of focus", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "some tunes played by whistling"], "sample_ids": ["zY3icUyMdh8", "u6BnG6YZqJ4"], "start_seconds": ["20", "0"], "properties": ["dog, bark, engine", "tune, play, whistling"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a car is driving and 
dogs are barking and squealing ", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "some men converse over an engine running"], "sample_ids": ["tDVADusiIoc", "sCiy7QS1U"], "start_seconds": ["60", "300"], "properties": ["wind, radio, waves", "men, converse, engine"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation?", "label": 1}, {"captions": ["a door opens and birds chirp", "a door opens and closes"], "sample_ids": ["yeFvk9x0wWI", "vBHyYJ8pL0"], "start_seconds": ["30", "2"], "properties": ["door, open, birds", "open, close, door"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which door opens and closes", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "water pouring and bubbling"], "sample_ids": ["uYT5gxnyMWM", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["female, spraying, scream", "water, bubbles, pouring"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wqADXCzngMw", "vfYTJq7nU"], "start_seconds": ["340", "130"], "properties": ["engine, idle, man", "rustling, ducks, quack"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", null], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a duck quacks and a woman speaks"], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["an electronic device bleeps once", "some men converse over an engine running"], "sample_ids": ["tHJ6JSa8Y4", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["bleeps, electronic, device", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a clock is ticking and beeping", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "people applaud and hoot and chat quietly"], "sample_ids": ["uZesmtKZGSw", "wwyfGO2J4"], "start_seconds": ["250", "90"], "properties": ["car, track, man", "people, applaud, hoot"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be at a concert", "label": 1}, {"captions": ["a goat screams and people speak in the background", "some clanking with distant murmuring"], "sample_ids": ["xC8kbrKJmco", "uMTTDZ2mb4"], "start_seconds": ["0", "30"], "properties": ["background, goat, scream", "clanking, murmuring, distant"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "people are talking and a car is driving by with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a motorcycle engine is idling", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vZAqdHZ81yA", "vYkA3cfXp5Q"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "engine, accelerate, idle"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["an engine is idling loudly", "an engine is idling"], "question": "which entity has an engine that is idling", "label": 0}, {"captions": ["a man speaks as crickets sing", "paper is crumpling consistently"], "sample_ids": ["ryFDPxgDOGc", "v5cSxLaHADY"], "start_seconds": ["570", "0"], "properties": ["a, crickets, sing", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "footage of the person holding a pair of 
scissors"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "vehicles pass by on a roadway"], "sample_ids": ["y2ZBGpgbhHM", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["animal, growl, bird", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds chirping and a dog panting", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["weDbePuc-Xc", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["cartoon character, music, vocalize", "sheep, baa, birds"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a goat bleats and birds chirp"], "question": "which entity is more likely to be a video of a sheep baaing?", "label": 1}, {"captions": ["heavy rain splashes as it falls", "an infant crying as a woman laughs"], "sample_ids": ["wP8ZKrlx3oA", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["fall, rain, splash", "a, laugh, infant"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "water splashes as an animal walks through"], 
"sample_ids": ["x6ijhqRY38s", "w1ir-sZ3Im8"], "start_seconds": ["250", "90"], "properties": ["something metal, glass, hit", "animal, water, splashes"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to cause water to splash", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a fly buzzes around loudly as birds chirp"], "sample_ids": ["tK4VlLsNxak", "uJV8NDaHqqk"], "start_seconds": ["120", "100"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "loud, fly, chirp"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a bee hive in a wooden box"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a swarm of bees buzzing around"], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet flushes and water drains", "people cheer as a vehicle engine revs"], "sample_ids": ["sfAvvZwdLCY", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["water drains, flushes, water", "engine revs, vehicle, people"], "captions_pred_video": ["footage of the toilet in the bathroom", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a toilet is flushed", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "some men converse over an engine running"], "sample_ids": ["w2JXXIAdUdg", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["snoring, distance, person", "men, converse, engine"], "captions_pred_video": ["a close up shot of a person's mouth with a 
toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a person snoring?", "label": 0}, {"captions": ["a person whistles and clicks a mouse", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["zCrAfDfv6-A", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "beeps, hit, woman"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a person whistles a song", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a person whistles a meandering tune", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uFoga8sHpiw", "tiDFTC-5vU"], "start_seconds": ["90", "30"], "properties": ["person, tune, whistle", "male, duck, laugh"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a man is speaking and ducks are quacking"], "question": "which entity is a person", "label": 0}, {"captions": ["a train horn sounds as it passes by", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["ukg5L09Wpvo", "zj2R0XoFr5k"], "start_seconds": ["150", "50"], "properties": ["sound, train, horn", "airplane, boy, fly"], 
"captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a toilet flushes and water drains"], "sample_ids": ["rqfQRErjfk8", "sfAvvZwdLCY"], "start_seconds": ["170", "20"], "properties": ["crowd, cheers, applauds", "water drains, flushes, water"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["vmrxwuAMb2I", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["a dog, inhales, exhales", "a train, a horn, a bell"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a dog barks and growls", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a toilet flushes and a female speaks"], "sample_ids": ["yks4cLgIDMc", "yaln9y8I7ms"], "start_seconds": ["170", "230"], "properties": ["background, speaking, child", "female, flushes, toilet"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a child is crying", "a toilet flushes and a man speaks"], "question": "which entity has a child shouting in the 
background", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y8WEcpOlT3I", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["wind, speak, buffeting", "rooster, crow, background, men"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "an airplane accelerates briefly"], "sample_ids": ["zkKdxzNC97Y", "zjTG0gaGCUI"], "start_seconds": ["27", "80"], "properties": ["hard, surface, door", "accelerates, airplane, briefly"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a jet engine roars as wind blows "], "question": "which object is moving", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["su6FAOcOA8c", "zFjIWfSD-4"], "start_seconds": ["4", "410"], "properties": ["engine, idle, woman", "People, motor, brakes"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["waves crash against a shoreline and wind blows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zdYdyF9-m8U", "uYT5gxnyMWM"], "start_seconds": ["7", "50"], "properties": ["wind, crash, shoreline", "female, spraying, scream"], 
"captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["waves crash and wind blows ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["weDbePuc-Xc", "w34HjHr6gAY"], "start_seconds": ["40", "30"], "properties": ["cartoon character, music, vocalize", "beeps, hit, woman"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tgbONvsP47Y", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["pass, vehicle, roadway", "a woman, laughs, animal"], "captions_pred_video": ["footage of a fire truck entering a garage", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking 
in the background", "an airplane accelerates briefly"], "sample_ids": ["s4Uz1Ffgo04", "zjTG0gaGCUI"], "start_seconds": ["100", "80"], "properties": ["roars, background, people speaking", "accelerates, airplane, briefly"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a jet engine roars as wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "rain falls onto a hard surface and thunder roars before music plays"], "sample_ids": ["sZPuqDgX2V0", "xNMovAf3o50"], "start_seconds": ["30", "0"], "properties": ["engine, accelerate, intercom", "rain, thunder, music"], "captions_pred_video": [null, "tieng mua - the falling rain lynk lee"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "thunder and rain with music playing in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a woman and man speak while food is frying"], "sample_ids": ["vdoxuJn9lTc", "zk-xJGQU8-4"], "start_seconds": ["40", "130"], "properties": ["person, burp, girl", "food, man, woman"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a child speaks followed by a burp", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wwyfGO2J4", "yDoT73BWsdA"], "start_seconds": ["90", "10"], "properties": ["people, applaud, hoot", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the 
steering wheel"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "people cheer as a vehicle engine revs"], "sample_ids": ["slZLHwNbbt4", "xjhAnI2q6hM"], "start_seconds": ["300", "6"], "properties": ["clap, distance, horn", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a car accelerates and wind blows"], "sample_ids": ["zhx6hoYrHeI", "u0TrcHhkPQ"], "start_seconds": ["160", "20"], "properties": ["engine, sputter, rough", "accelerates, wind, blows"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "several insects fly while two men talk"], "sample_ids": ["yFB25fqfU8I", "s-T9OVOiMLo"], "start_seconds": ["300", "330"], "properties": ["wave, crash, shoreline", "several, fly, men"], "captions_pred_video": ["footage of a person surfing in the ocean", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be seen in a nature documentary", "label": 
1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "small dogs yip and bark sharply"], "sample_ids": ["xOZfdgAgJ9o", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["woman, whimpering, speaking", "bark, yip, sharply"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["t8CV69hcvF0", "xBxDz0CFVn0"], "start_seconds": ["210", "30"], "properties": ["person, sneeze, follow", "stream, water, flow"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "wind blows as people chatter quietly"], "sample_ids": ["vBslzh7saPw", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["engine, roar, louder", "wind, chatter, people"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking with wind noise in the background "], "question": "which is quieter", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "an emergency vehicle engine runs then a horn blows and siren sounds"], "sample_ids": ["xZepNM9qcRA", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["background, motor, run", "engine, horn, siren"], "captions_pred_video": ["a close-up 
view of the motorcycle's engine and exhaust system", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn and siren?", "label": 1}, {"captions": ["a machine beeps continuously", "water splashes as an animal walks through"], "sample_ids": ["y682ml90jGw", "w1ir-sZ3Im8"], "start_seconds": ["11", "90"], "properties": ["beeps, machine, continuously", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a beeping sound is being made ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "a woman speaks happily and an animal chirps"], "sample_ids": ["vMf1dLD6Sng", "uWAAAL4CIoc"], "start_seconds": ["6", "0"], "properties": ["frog, bird, vocalize", "a woman, chirps, animal"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", null], "captions_pred_audio": ["a frog croaks loudly", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "vehicle engines race around a track as a man commentates"], "sample_ids": ["vZAw4apG0Es", "sZPuqDgX2V0"], "start_seconds": ["30", "30"], "properties": ["background, clock, ticktocks", "commentator, race, track"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a helicopter is flying overhead "], "question": "which entity is a video of a race?", "label": 1}, {"captions": ["a machine runs continuously", "an engine runs loudly"], "sample_ids": ["wdXV3Pv0jiY", "vqZuVbG6-HI"], "start_seconds": ["11", 
"130"], "properties": ["machine, running, continuously", "loud, engine, run"], "captions_pred_video": ["footage is blurry and shaky", "footage is blurry because it's raining outside"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a lawn mower is running and men are speaking "], "question": "which machine is running continuously", "label": 0}, {"captions": ["bees buzz as wind blows", "an aircraft engine runs"], "sample_ids": ["tMJne1a4AFI", "yLCORCnd35Q"], "start_seconds": ["0", "0"], "properties": ["bees, buzz, wind", "engine, aircraft, runs"], "captions_pred_video": ["a swarm of bees on the ground", "a lufthansa airbus a380 landing at london's heathrow airport"], "captions_pred_audio": ["a swarm of bees buzzing around", "a train is moving and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["uJV8NDaHqqk", "vlS6YMeWAPo"], "start_seconds": ["100", "40"], "properties": ["loud, fly, chirp", "sheep, baa, birds"], "captions_pred_video": ["a bee hive in a wooden box", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a swarm of bees buzzing around", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a child speaks in closed space"], "sample_ids": ["tgbONvsP47Y", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["pass, vehicle, roadway", "child, space, speak"], "captions_pred_video": ["footage of a fire truck entering a garage", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["motor noise is followed by a horn honking 
and a siren wailing", "repeated tapping is accompanied by water running and a woman speaking softly"], "sample_ids": ["y2bVZ7rz-5M", "wvKpEYswXO0"], "start_seconds": ["280", "150"], "properties": ["motor noise, horn, siren", "sound, water, running"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity has a horn honking?", "label": 0}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a motorcycle engine works nearby"], "sample_ids": ["sjlVMgdGSK0", "tOSWIURC-4"], "start_seconds": ["30", "0"], "properties": ["accelerates, vehicle, race car", "engine, work, nearby"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a lawn mower is running "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a person sniffs and sneezes", "an infant crying as a woman laughs"], "sample_ids": ["uRlbY6aoBU", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["sneezes, person, sniffs", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is sneezing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 0}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "birds chirps while a siren signals in the distance"], "sample_ids": ["tDVADusiIoc", "uKCSGgof8gI"], "start_seconds": ["60", "12"], "properties": ["wind, radio, waves", "chirps, distance, signal"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a street in a small town on a sunny day"], 
"captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a truck is accelerating and revving its engine "], "question": "which entity is more quiet", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "several insects fly while two men talk"], "sample_ids": ["sLUnaPT5gM8", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["loud, laughter, intermittent", "several, fly, men"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be a video of insects flying?", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a infant makes noise and is excited"], "sample_ids": ["uPDn2BFTHk", "wIJK3-5y0kA"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a male speaks over some small clicks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["uXxVebHsGZ8", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["male, clicks, speak", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "people applaud and hoot and chat quietly"], "sample_ids": ["ylpYOorfH4o", "wwyfGO2J4"], "start_seconds": ["410", "90"], "properties": ["motor, run, steady", "people, applaud, hoot"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["goats bleat and people speak", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["z5iUE5h0EPs", "sYITalLZjj4"], "start_seconds": ["30", "30"], "properties": ["goats bleat, people speak, language", "water, rushes, background, birds"], "captions_pred_video": ["of the goat in the barn", "two ducks are swimming in the water near each other"], "captions_pred_audio": ["a goat bleats and a man speaks", "wind blows and birds chirp"], "question": "which entity is more 
quiet", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uPDn2BFTHk", "vbZ-0lGPneg"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["an insect buzzes around continuously", "pigeons vocalize and birds chirp"], "sample_ids": ["v25l1jef3JY", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["buzzes, continuously, insect", "vocalize, bird, chirp"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of the pigeon in the cage"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a child speaks in closed space"], "sample_ids": ["zgUgkpk78xU", "yW6FWLSLkx4"], "start_seconds": ["70", "40"], "properties": ["clinking, humming, horn", "child, space, speak"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", 
"label": 1}, {"captions": ["a man speaks as horns blow", "a man speaks as a motor runs in the background"], "sample_ids": ["tHyNqRyK34A", "xZepNM9qcRA"], "start_seconds": ["24", "30"], "properties": ["a, man, speaks", "background, motor, run"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wvKpEYswXO0", "xfaoyyzw2WU"], "start_seconds": ["150", "180"], "properties": ["sound, water, running", "loud, jet engine, roar"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a telephone rings followed by a woman talking"], "sample_ids": ["un9VQlzgZM", "tGcFnX0GHI"], "start_seconds": ["5", "0"], "properties": ["females, talk, laugh", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a horn blasts loudly as a train passes"], "sample_ids": ["vhJWZheqaE", "zsLxS-uLJTw"], "start_seconds": ["0", "20"], "properties": ["water drains unevenly, toilet flushes, water drains", "horn, blast, 
train"], "captions_pred_video": [null, "footage of the train on the tracks at sunrise or sunset"], "captions_pred_audio": ["a toilet is flushed", "a train blows its horn and moves on the tracks "], "question": "which entity is louder", "label": 1}, {"captions": ["a woman talking as an infant is crying", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tMbMDvT50j8", "tiDFTC-5vU"], "start_seconds": ["12", "30"], "properties": ["a, talk, infant", "male, duck, laugh"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and ducks are quacking"], "question": "which entity is about a person talking?", "label": 0}, {"captions": ["a child and woman laughs and the woman speaks", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["uPDn2BFTHk", "rwtmaKiCcQU"], "start_seconds": ["140", "30"], "properties": ["woman, laughs, speaks", "nozzle, depressed, spray can"], "captions_pred_video": [null, "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a baby laughs and a woman speaks", "spraying and people speaking"], "question": "which entity is about a spray can?", "label": 1}, {"captions": ["a man speaks while water drains", "plastic is tapped on while someone speaks"], "sample_ids": ["vSeGhaZt-aI", "wvKpEYswXO0"], "start_seconds": ["50", "150"], "properties": ["water, drain, man", "plastic, tap, speak"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a video of someone speaking?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "pigeons vocalize and birds 
chirp"], "sample_ids": ["zofjfKhqLk8", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["background, metal, clings", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of the pigeon in the cage"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a power tool runs and touches a surface", "an infant crying as a woman laughs"], "sample_ids": ["zfvPRf3chY", "xhmRY9yhC7c"], "start_seconds": ["290", "20"], "properties": ["power tool, run, touch", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a baby cries and a woman speaks"], "question": "which is not a person", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a saw finishes running as metal clings in the background"], "sample_ids": ["wtDqrBygTcU", "zofjfKhqLk8"], "start_seconds": ["30", "10"], "properties": ["man, engine, run", "background, metal, clings"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a man is speaking and a motor is running", "a large engine is running and a bell is ringing"], "question": "which entity is about a man speaking as a boat engine runs?", "label": 0}, {"captions": ["a toilet flushes and water sputters as it drains", "a clock alarm sounds and gears turn"], "sample_ids": ["smGI3C1NZc", "w2M4i1mklOA"], "start_seconds": ["30", "30"], "properties": ["water, drain, toilet", "alarm, gears, turn"], "captions_pred_video": [null, "footage of an antique clock"], "captions_pred_audio": ["a toilet is flushed", "a clock is ticking and a bell is ringing "], "question": "which entity is a clock?", "label": 1}, {"captions": 
["a person screams glaringly", "a propeller rotates loudly and intensely"], "sample_ids": ["xC8kbrKJmco", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["glaringly, screams, person", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a goat is bleating ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a person snoring several times", "a baby laugh at a sputter"], "sample_ids": ["spJCm8tD9Zo", "sLUnaPT5gM8"], "start_seconds": ["90", "0"], "properties": ["snore, person, several", "laugh, sputter, baby"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a person is snoring loudly", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is a baby?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a child speaks in closed space"], "sample_ids": ["vJrjSeP17yE", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["a person is sleeping, snoring, person", "child, space, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wyllXV6PjKo", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["a kid, talk, cry", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a duck quacks and a 
woman speaks"], "question": "which entity has a kid?", "label": 0}, {"captions": ["a person is snoring while sleeping", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vJrjSeP17yE", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["a person is sleeping, snoring, person", "men, talk, cars"], "captions_pred_video": ["a black background with a small plane flying in the sky", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a piece of wood is being placed down and sawed", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["uiItxDsDMFI", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["wood, piece, saw", "two men, woman, birds"], "captions_pred_video": ["a man cutting a log with an axe in the woods", null], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a still image?", "label": 0}, {"captions": ["a stream of water flows as people talk and wind blows", "a car speeding up in the distance"], "sample_ids": ["xBxDz0CFVn0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["stream, water, flow", "distance, car, speed"], "captions_pred_video": ["footage is blurry and 
out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zj2R0XoFr5k", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["airplane, fly, overhead", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a power tool runs and touches a surface", "a clock ticktocks"], "sample_ids": ["zfvPRf3chY", "v-g-j2uTByM"], "start_seconds": ["290", "30"], "properties": ["power tool, run, touch", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a clock is ticking loudly"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["a person snoring several times", "a person snores loudly multiple times at a close distance"], "sample_ids": ["spJCm8tD9Zo", "sSMl2vc3ek"], "start_seconds": ["90", "20"], "properties": ["snore, person, several", "loud, multiple, distance"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a person snoring loudly"], "question": "which person is snoring", "label": 1}, {"captions": ["a woman sneezes then speaks", "people speak and tapping occurs"], "sample_ids": ["x4dZyf9Gbj0", "tFCUUGdREgA"], "start_seconds": ["130", "70"], "properties": ["sneezes, speaks, 
woman", "people, tap, speak"], "captions_pred_video": ["footage is blurry and out of focus", "a person riding a white horse in an indoor arena"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and walking with wind noise in the background "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["scraping and female speech with distant music", "wind blowing followed by a zoom"], "sample_ids": ["yHeVV-xeOxQ", "vr8ZXjEBhMQ"], "start_seconds": ["130", "150"], "properties": ["female, speech, music", "wind, blow, zoom"], "captions_pred_video": ["of a girl milking a goat's udder", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a video of a wind blowing?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a kid speaks followed by music playing"], "sample_ids": ["yaln9y8I7ms", "tQWGZLItBXk"], "start_seconds": ["230", "170"], "properties": ["female, flushes, toilet", "music, kid, speak"], "captions_pred_video": ["footage is blurry and out of focus", "worms revolution screenshots"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a female speaking?", "label": 0}, {"captions": ["a woman speaks happily and an animal chirps", "a man speaks while rain falls onto a hard surface"], "sample_ids": ["uWAAAL4CIoc", "wqN6IIHw3po"], "start_seconds": ["0", "30"], "properties": ["a woman, chirps, animal", "rain, surface, fall"], "captions_pred_video": [null, "in your own words what is happening in this screenshot? 
blood splattered all over the place"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking and water is splashing"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yeFvk9x0wWI", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["chirp, twitter, clatter", "men, talk, cars"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a consistent ticking pattern", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sCeWURVHfOM", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["ticking, pattern, clock", "men, talk, cars"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["ticking of a clock", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yDoT73BWsdA", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["engine revs, tires squeal, vehicle", "water, radio, man"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["white noise and birds chirping", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wRBHTgrbiwg", "vb1fPSDI4c"], "start_seconds": ["50", "30"], "properties": ["noise, white, chirping", "multiple, people, yell"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream of water flows as people talk and wind blows", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["xBxDz0CFVn0", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["stream, water, flow", "People, motor, brakes"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while a car is driving and a ticking sound is 
heard "], "question": "which entity is more likely to be in a car?", "label": 1}, {"captions": ["a toilet flushes and water drains", "paper is crumpling consistently"], "sample_ids": ["sfAvvZwdLCY", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["water drains, flushes, water", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a toilet is flushed", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a person is snoring while sleeping", "a car accelerates and wind blows"], "sample_ids": ["vJrjSeP17yE", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["a person is sleeping, snoring, person", "accelerates, wind, blows"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "birds chirp and a dog breathes heavily"], "sample_ids": ["yajyRTUQk3U", "y2ZBGpgbhHM"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "dog, chirp, breathe"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "birds chirping and a dog panting"], "question": "which entity is a dog", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "vehicles pass by on a roadway"], "sample_ids": ["y2ZBGpgbhHM", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["birds, tweet, pant", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds chirping and a dog panting", "a car 
is driving on the road "], "question": "which entity is more likely to be seen in a city", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a frog croaks as other frogs croak in the background"], "sample_ids": ["uEU-Hg5MTN8", "yswmmRZFItk"], "start_seconds": ["27", "0"], "properties": ["animal, grunts, snorts", "background, frog, croak"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a frog is croaking"], "question": "which animal is speaking", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a clock ticktocks"], "sample_ids": ["sQGXqGcwOTc", "v-g-j2uTByM"], "start_seconds": ["3", "30"], "properties": ["audio, kid, giggles", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an airplane accelerates briefly", "an infant crying frantically"], "sample_ids": ["zjTG0gaGCUI", "zwOBqeFTgiU"], "start_seconds": ["80", "30"], "properties": ["accelerates, airplane, briefly", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["vehicles pass by on a roadway", "several insects fly while two men talk"], "sample_ids": ["tgbONvsP47Y", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["pass, vehicle, roadway", "several, fly, men"], "captions_pred_video": ["footage of a fire truck entering a garage", "a man climbing up a tree using a rope to reach the 
top of the tree"], "captions_pred_audio": ["a car is driving on the road ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["tQWGZLItBXk", "rwtmaKiCcQU"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "nozzle, depressed, spray can"], "captions_pred_video": ["worms revolution screenshots", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "spraying and people speaking"], "question": "which entity has a nozzle depressed?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "water flows and trickles"], "sample_ids": ["yRx9txMcBl0", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "water, flow, trickle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car is revving its engine and skidding ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "a man speaks as a motor 
runs in the background"], "sample_ids": ["uRlbY6aoBU", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["a, distance, sneeze", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is sneezing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vimzuGQvdcU", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["a, man, yells", "loud, multiple, distance"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an engine runs and wind blows", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["vs65y4qmyBE", "x9JovgqUcs"], "start_seconds": ["340", "500"], "properties": ["engine, run, wind", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man speaks and types on a keyboard"], "question": "which entity is a person", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "wind blowing followed by a zoom"], "sample_ids": ["zofjfKhqLk8", "vr8ZXjEBhMQ"], "start_seconds": ["10", "150"], "properties": ["background, metal, clings", "wind, blow, zoom"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "wind blows and a chainsaw cuts through wood "], 
"question": "which entity is a zoom", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["u7C-AEBQM", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["ticks, rhythmic, quiet", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks and food sizzles while frying", "vehicles pass by on a roadway"], "sample_ids": ["wTideSjRFS0", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["food, sizzle, woman", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a stream of water runs briefly"], "sample_ids": ["zk-xJGQU8-4", "x-PeY8Yb8M4"], "start_seconds": ["130", "300"], "properties": ["food, man, woman", "stream, water, run"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "an insect buzzes around continuously"], "sample_ids": ["v-wcQf4BDY0", "v25l1jef3JY"], "start_seconds": ["120", "0"], "properties": ["bark, yip, sharply", "buzzes, continuously, insect"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what 
is happening", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a dog barks and growls", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "people applaud and hoot and chat quietly"], "sample_ids": ["t8CV69hcvF0", "wwyfGO2J4"], "start_seconds": ["210", "90"], "properties": ["person, sneeze, follow", "people, applaud, hoot"], "captions_pred_video": ["of an airplane flying in the dark sky at night", null], "captions_pred_audio": ["a woman sneezes and speaks", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be at a concert", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "a person is snoring while sleeping"], "sample_ids": ["uWAAAL4CIoc", "vJrjSeP17yE"], "start_seconds": ["0", "40"], "properties": ["a, dog, vocalize", "a person is sleeping, snoring, person"], "captions_pred_video": [null, "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "an small aircraft engine runs and a boy speaks"], "sample_ids": ["uYT5gxnyMWM", "xSKJGCItUWE"], "start_seconds": ["50", "10"], "properties": ["female, spraying, scream", "engine, run, boy"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a high pitched engine is running and a child speaks"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a clock ticks quietly and rhythmically", "a man speaks as a car is passing by"], "sample_ids": ["u7C-AEBQM", "sK4u5T8hW78"], "start_seconds": 
["30", "30"], "properties": ["ticks, rhythmic, quiet", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a person is snoring while sleeping", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vJrjSeP17yE", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "loud, laughter, intermittent"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a person snoring loudly", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water flows as men speak and yell", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["vJ7JPEFhyLA", "tDlysoZiA1I"], "start_seconds": ["16", "0"], "properties": ["water, flow, men", "animal, grunts, chirps"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a man speaks and a rooster crows while men talk in the background"], 
"sample_ids": ["v5P-ThUCINM", "wz7N8YRy74I"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "people speak as gunfire rings out"], "sample_ids": ["uYT5gxnyMWM", "wqTCwqVRDlk"], "start_seconds": ["50", "80"], "properties": ["person, spray, yell", "gunfire, ring, speak"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "dishes cling together then a man begins to speak"], "sample_ids": ["zdYdyF9-m8U", "sQGXqGcwOTc"], "start_seconds": ["7", "3"], "properties": ["wind, crash, shoreline", "cling, speak, dishes"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["waves crash and wind blows ", "mechanisms are operating and water is splashing "], "question": "which entity is a still image?", "label": 0}, {"captions": ["a man speaks while water trickles and flows", "a man speaks followed by another man speaking outside"], "sample_ids": ["sapQIQUhFc", "viuTg1M-dqg"], "start_seconds": ["280", "30"], "properties": ["water, trickles, flow", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a 
man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a horn honks and then loudly blares", "cats meow and then a person begins to talk while the cats continue to meow"], "sample_ids": ["wnpJndXuxLc", "x5cuQjOdM3E"], "start_seconds": ["50", "30"], "properties": ["horn, honk, loud", "cat, talk, meow"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a cat meows and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a woman speaks in a fast tone with a male"], "sample_ids": ["w1mlz3Pe4fU", "sTpirNYo8vQ"], "start_seconds": ["300", "30"], "properties": ["vocalize, chirp, continuously", "a, tone, fast"], "captions_pred_video": ["of a bird in a cage", "of a man taking a selfie on a bus"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking while a car is revving and accelerating "], "question": "which entity is speaking", "label": 1}, {"captions": ["a person screams glaringly", "pigeons vocalize and birds chirp"], "sample_ids": ["xC8kbrKJmco", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["glaringly, screams, person", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["uWPRNLnpy7Y", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["accelerate, laugh, vehicle", "water, radio, man"], "captions_pred_video": ["is taken from a car driving down the 
street", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a stream of water runs briefly"], "sample_ids": ["zhx6hoYrHeI", "x-PeY8Yb8M4"], "start_seconds": ["160", "300"], "properties": ["engine, sputter, rough", "stream, water, run"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "sawing of wood and rustling with leaves blowing in the distance"], "sample_ids": ["ukxt9I7eMMg", "uiItxDsDMFI"], "start_seconds": ["30", "30"], "properties": ["continuous, woman, speaking", "sound, distance, leaves"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a saw is being used with background noise "], "question": "which entity has a woman speaking towards the end?", "label": 0}, {"captions": ["multiple people speak and children yell while water gurgles", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vb1fPSDI4c", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a game", 
"label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zO-LSSY92ZM", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["liquid, surface, sound", "engine, laugh, loud"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "footage of a man driving a car in the dark"], "captions_pred_audio": ["steam is hissing and hissing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "people cheer as a vehicle engine revs"], "sample_ids": ["tDlfY3nmx1A", "xjhAnI2q6hM"], "start_seconds": ["160", "6"], "properties": ["applause, laugh, man", "engine revs, vehicle, people"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a truck is revving its engine and a man is speaking "], "question": "which entity 
is about a vehicle?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "an infant crying frantically"], "sample_ids": ["wwyfGO2J4", "zwOBqeFTgiU"], "start_seconds": ["90", "30"], "properties": ["people, applaud, hoot", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "pigeons vocalize and birds chirp"], "sample_ids": ["ul60S8TXDA8", "uiS58TNyUiw"], "start_seconds": ["60", "430"], "properties": ["sound, distance, bell", "vocalize, bird, chirp"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "of the pigeon in the cage"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a man speaks as a motor runs in the background"], "sample_ids": ["vbr9mHKc8WM", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an engine is idling", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 0}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "an electric engine works nearby followed by a child talking"], "sample_ids": ["tQWGZLItBXk", "xSKJGCItUWE"], "start_seconds": ["170", "10"], "properties": ["music, person, ding", "engine, work, child"], "captions_pred_video": ["worms revolution screenshots", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a child speaks music 
plays video game sounds sound effects and sound effects play ", "a high pitched engine is running and a child speaks"], "question": "which entity has a child talking?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "birds chirp and an insect buzzes around"], "sample_ids": ["su6FAOcOA8c", "t97k0cejSQE"], "start_seconds": ["4", "250"], "properties": ["engine, run, woman", "bird, chirp, insect"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a bee on a purple thistle flower"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a bee buzzes and a woman speaks"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a dog barks and whimpers", "a crowd yells, reacts and applauds"], "sample_ids": ["sShpyu2l4YQ", "wztCSUxOf8"], "start_seconds": ["0", "130"], "properties": ["barks, whimpers, dog", "a crowd, yells, applauds"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking and a crowd is clapping"], "question": "which entity is more likely to be a crowd", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "birds chirp and objects are moved around"], "sample_ids": ["wRBHTgrbiwg", "yPUYU6t3rwo"], "start_seconds": ["50", "370"], "properties": ["birds, chirp, cooing", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a stream runs then someone speaks", "a car accelerates and wind blows"], "sample_ids": ["wbHTKEJZyhc", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["stream, 
run, someone", "accelerates, wind, blows"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a door slams shut roughly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zkKdxzNC97Y", "zj2R0XoFr5k"], "start_seconds": ["27", "50"], "properties": ["a door, slams, shut", "airplane, boy, fly"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a door is opened and closed", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a male is 
speaking and a duck quacks as others laugh", "birds chirp and objects are moved around"], "sample_ids": ["tiDFTC-5vU", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["male, duck, laugh", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a car accelerates and wind blows"], "sample_ids": ["sG7TyPnFDR0", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["beeps, machine, smoke alarm", "accelerates, wind, blows"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "a steam engine runs and whistles as it passes by"], "sample_ids": ["sTpirNYo8vQ", "se87d6yxEOA"], "start_seconds": ["30", "10"], "properties": ["a, tone, fast", "run, whistle, pass"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a train is moving and blowing its whistle "], "question": "which entity is moving", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "some men converse over an engine running"], "sample_ids": ["vXlk0lIQBFo", "sCiy7QS1U"], "start_seconds": ["470", "300"], "properties": ["wind, talk, vocalize", "men, converse, engine"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind 
chimes are ringing and people are speaking and laughing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a conversation?", "label": 1}, {"captions": ["continuous sneezing together with speech", "people speak as gunfire rings out"], "sample_ids": ["x4dZyf9Gbj0", "wqTCwqVRDlk"], "start_seconds": ["130", "80"], "properties": ["continuous, sneeze, speech", "gunfire, ring, speak"], "captions_pred_video": ["footage is blurry and out of focus", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "paper is crumpling consistently"], "sample_ids": ["vYkA3cfXp5Q", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["engine, accelerate, idle", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["an engine is idling", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a man speaks as a motor runs in the background"], "sample_ids": ["w0xsN8X18Y", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["music, surface, rain", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "an infant crying frantically"], "sample_ids": ["sK4u5T8hW78", "zwOBqeFTgiU"], "start_seconds": 
["30", "30"], "properties": ["a, car, pass", "cry, infant, frantically"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["an insect buzzes around continuously", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["v25l1jef3JY", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["buzzes, continuously, insect", "a woman, laughs, animal"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 0}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["x5cuQjOdM3E", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["cat, talk, meow", "water, radio, man"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a loud snarling engine is followed by 
a man laughing"], "sample_ids": ["yeFvk9x0wWI", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["chirp, twitter, clatter", "engine, laugh, loud"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yYEVLuqEytU", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "a woman, a television program, a bird"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sfAvvZwdLCY", "wDVMhEdTiVw"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "gun, shoot, water"], "captions_pred_video": ["footage of the toilet in the bathroom", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a toilet is flushed", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity has water sloshing nearby?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "birds tweet and squawk"], "sample_ids": ["tgbONvsP47Y", "w1mlz3Pe4fU"], "start_seconds": ["0", "300"], "properties": ["pass, vehicle, roadway", "squawk, tweet, scream"], "captions_pred_video": ["footage of a fire truck entering a garage", "of a bird in a cage"], "captions_pred_audio": ["a car is 
driving on the road ", "birds are chirping and singing"], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y2ZBGpgbhHM", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["birds, tweet, pant", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about animals?", "label": 0}, {"captions": ["a man sprays as a scraping occurs in the background", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sOa7g-44Dag", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["background, man, spray", "engine, idle, woman"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a person is snoring while sleeping", "plastic is tapped on while someone speaks"], "sample_ids": ["vJrjSeP17yE", "wvKpEYswXO0"], "start_seconds": ["40", "150"], "properties": ["a person is sleeping, snoring, person", "plastic, tap, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a person screams glaringly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xC8kbrKJmco", 
"su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["glaringly, screams, person", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a goat is bleating ", "a woman is speaking and a subway train is moving "], "question": "which entity is a person", "label": 0}, {"captions": ["a stream of water runs briefly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["x-PeY8Yb8M4", "uYT5gxnyMWM"], "start_seconds": ["300", "50"], "properties": ["stream, water, run", "female, spraying, scream"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car is driving on a wet road ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person spraying and screaming?", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a stream of water runs briefly"], "sample_ids": ["vfYTJq7nU", "x-PeY8Yb8M4"], "start_seconds": ["130", "300"], "properties": ["ducks, quack, man", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a woman sneezes then speaks", "a stream of water runs briefly"], "sample_ids": ["x4dZyf9Gbj0", "x-PeY8Yb8M4"], "start_seconds": ["130", "300"], "properties": ["sneezes, speaks, woman", "stream, water, run"], "captions_pred_video": ["footage is blurry and out of focus", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman sneezes and speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a cat meows and children speak", "a frog croaks as other frogs croak 
in the background"], "sample_ids": ["x5cuQjOdM3E", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["cat, speak, children", "background, frog, croak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a close up of a frog in the water"], "captions_pred_audio": ["a cat meows and a woman speaks", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "some men converse over an engine running"], "sample_ids": ["vbZ-0lGPneg", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["a woman, a television program, a bird", "men, converse, engine"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has more people", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "waves crash against a shoreline and people speak"], "sample_ids": ["sSMl2vc3ek", "yFB25fqfU8I"], "start_seconds": ["20", "300"], "properties": ["a person, laughs, snores", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "wind blowing followed by a zoom"], "sample_ids": ["sU53zg9Jp7s", "vr8ZXjEBhMQ"], "start_seconds": ["380", "150"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "wind, blow, zoom"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["birds chirp and a doorbell 
rings with breathing and music in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to cause a woman to gasp", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["yRx9txMcBl0", "wDVMhEdTiVw"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "gun, shoot, water"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a woman speaks as she rubs two objects together"], "sample_ids": ["s7knHCFW82w", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["blow horn, get close, train", "two objects, woman, speak"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a duck quacks continuously"], "sample_ids": ["sShpyu2l4YQ", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["growl, bark, yip", "quacks, continuously, duck"], "captions_pred_video": ["the puppies are playing with a toy", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a dog is barking and growling", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a telephone rings followed by a woman talking"], "sample_ids": ["y2bVZ7rz-5M", "tGcFnX0GHI"], "start_seconds": ["280", "0"], "properties": ["motor noise, horn, siren", "ring, talk, woman"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is followed by a woman talking", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["u2f5NpsoHBg", "vVhthZ45k3Y"], "start_seconds": ["30", 
"30"], "properties": ["person, laugh, clap", "cat, purr, hiss"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking and a cat is meowing"], "question": "which entity is a person", "label": 0}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["w2JXXIAdUdg", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["snoring, distance, person", "a woman, laughs, animal"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a woman is speaking and a baby is crying"], "question": "which entity has a person snoring nearby?", "label": 0}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "people cheer as a vehicle engine revs"], "sample_ids": ["tDVADusiIoc", "xjhAnI2q6hM"], "start_seconds": ["60", "6"], "properties": ["water, radio, man", "engine revs, vehicle, people"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "an airplane engine runs"], "sample_ids": ["zj2R0XoFr5k", "yVPZ2MNWpms"], "start_seconds": ["50", "0"], "properties": ["airplane, fly, overhead", "engine, airplane, runs"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of an airport with planes 
parked on the tarmac"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a car is driving by on the road "], "question": "which airplane is flying overhead", "label": 0}, {"captions": ["a man yells and speaks as water splashes", "water pouring and bubbling"], "sample_ids": ["vimzuGQvdcU", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["a, man, yells", "water, bubbles, pouring"], "captions_pred_video": ["a group of people are rafting down a river", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "water is running from a faucet"], "question": "which entity is a video of water pouring and bubbling?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "vehicles pass by on a roadway"], "sample_ids": ["tEE3MpBt1sg", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["drill, something, laugh", "pass, vehicle, roadway"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a fire truck entering a garage"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, 
{"captions": ["a woman speaks and food sizzles while frying", "a man speaks and is typing on a keyboard"], "sample_ids": ["wTideSjRFS0", "x9JovgqUcs"], "start_seconds": ["30", "500"], "properties": ["food, sizzle, woman", "a, man, speaks, keyboard"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man speaks and types on a keyboard"], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a stream of water runs briefly"], "sample_ids": ["sG7TyPnFDR0", "x-PeY8Yb8M4"], "start_seconds": ["180", "300"], "properties": ["beeps, machine, smoke alarm", "stream, water, run"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yVumC9TGknc", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["humming, clock, birds", "three men, wind, flow"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["children speak as a female ask them questions", "people applaud and hoot and chat quietly"], "sample_ids": ["wEBlkGWVWwE", "wwyfGO2J4"], "start_seconds": ["260", "90"], "properties": ["female, speak, questions", "people, applaud, hoot"], "captions_pred_video": 
["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sofxkNWaP0s", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["wind, engine, louder", "loud, laughter, intermittent"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a crowd yells, reacts and applauds", "a man speaks followed by another man speaking outside"], "sample_ids": ["wztCSUxOf8", "viuTg1M-dqg"], "start_seconds": ["130", "30"], "properties": ["a crowd, yells, applauds", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vmrxwuAMb2I", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["a dog, inhales, exhales", "water, radio, man"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking while the wind is 
blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a telephone rings followed by a woman talking"], "sample_ids": ["zFjIWfSD-4", "tGcFnX0GHI"], "start_seconds": ["410", "0"], "properties": ["People, motor, brakes", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "water pouring and bubbling"], "sample_ids": ["vSeGhaZt-aI", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["water, bubbles, speak", "water, bubbles, pouring"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "a saw finishes running as metal clings in the background"], 
"sample_ids": ["tqR406bGiE", "zofjfKhqLk8"], "start_seconds": ["40", "10"], "properties": ["flush, water, gurgle", "background, metal, clings"], "captions_pred_video": [null, "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a toilet is flushed", "a large engine is running and a bell is ringing"], "question": "which entity is about a flushing toilet?", "label": 0}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "roadway noise occurs and a truck accelerates"], "sample_ids": ["su6FAOcOA8c", "tgbONvsP47Y"], "start_seconds": ["4", "0"], "properties": ["engine, idle, woman", "noise, truck, accelerate"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a train horn blows as it passes by"], "sample_ids": ["vbr9mHKc8WM", "zVacuqSb4LI"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video 
by"], "captions_pred_audio": ["an engine is idling", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["an animal quacks rapidly", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vh30P49Po6s", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["animal, quacks, rapidly", "three men, wind, flow"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person", "label": 0}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["smDKStoHBJo", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["a, infant, speaking", "engine, idle, woman"], "captions_pred_video": ["a man holding a crying baby in his arms", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking to an infant?", "label": 0}, {"captions": ["a clock ticks quietly and rhythmically", "a dog barks and whimpers"], "sample_ids": ["u7C-AEBQM", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["ticks, rhythmic, quiet", "barks, whimpers, dog"], "captions_pred_video": [null, "the puppies are playing with a toy"], "captions_pred_audio": ["a ticktock of a clock", "a dog is barking and growling"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks as music plays before artillery is fired", "a person speaks briefly"], "sample_ids": ["vcmWSmvti8", "zOZleIRqZm4"], "start_seconds": ["30", "80"], "properties": ["music, man, fire", "person, 
talk, brief"], "captions_pred_video": [null, "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking with crickets chirping in the background"], "question": "which entity is more like a talk show", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "water pouring and bubbling"], "sample_ids": ["yLy-WycbVVE", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["background, people, talk", "water, bubbles, pouring"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "water is running from a faucet"], "question": "which entity is more silent", "label": 1}, {"captions": ["an insect buzzes around continuously", "an insect buzzes around continuously"], "sample_ids": ["v25l1jef3JY", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["buzzes, continuously, insect", "buzzes, continuously, insect"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a black background with a cartoon character in the 
foreground"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a fly is buzzing around a microphone "], "question": "which insect buzzes around continuously", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "an infant crying frantically"], "sample_ids": ["vlJS7LN2XyM", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["background, clocks, ticking", "cry, infant, frantically"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "of the baby crying in the car seat"], "captions_pred_audio": ["a ticktock of a clock", "a baby cries loudly"], "question": "which entity is more active", "label": 1}, {"captions": ["ticking continues without interruption", "multiple people speak and children yell while water gurgles"], "sample_ids": ["v-g-j2uTByM", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "multiple, people, yell"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", null], "captions_pred_audio": ["a clock is ticking loudly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a kid speaks followed by music playing", "a woman speaks and is crumpling paper"], "sample_ids": ["tQWGZLItBXk", "xvDdE3zNf8Y"], "start_seconds": ["170", "120"], "properties": ["music, kid, speak", "A, crumple, paper"], "captions_pred_video": ["worms revolution screenshots", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman speaks and crumples paper"], "question": "which entity is crumpling paper", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a stream of water runs briefly"], "sample_ids": 
["yYJksgsxx5U", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["audio, woman, silverware", "stream, water, run"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "a car accelerates and wind blows"], "sample_ids": ["xyL9F5VrjkE", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["wind, blows, vehicle", "accelerates, wind, blows"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wvKpEYswXO0", "su6FAOcOA8c"], "start_seconds": ["150", "4"], "properties": ["plastic, tap, speak", "engine, idle, woman"], "captions_pred_video": ["of the person preparing food in the kitchen", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "an infant crying frantically"], "sample_ids": ["sShpyu2l4YQ", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["growl, bark, yip", "cry, infant, frantically"], "captions_pred_video": ["the puppies are playing with a toy", "of the baby crying in the car seat"], "captions_pred_audio": ["a dog is barking and growling", "a baby cries loudly"], "question": "which entity is crying", 
"label": 1}, {"captions": ["a woman speaks with water running", "a car accelerates and wind blows"], "sample_ids": ["wTideSjRFS0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["water, running, woman", "accelerates, wind, blows"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "people speak as gunfire rings out"], "sample_ids": ["sYITalLZjj4", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["water, rushes, background, birds", "gunfire, ring, speak"], "captions_pred_video": ["two ducks are swimming in the water near each other", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["scraping and female speech with distant music", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yHeVV-xeOxQ", "uYT5gxnyMWM"], "start_seconds": ["130", "50"], "properties": ["female, speech, music", "a, scream, girl"], "captions_pred_video": ["of a girl milking a goat's udder", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream", "label": 1}, {"captions": ["people speak then an engine runs", "a man speaks as a motor runs in the background"], "sample_ids": ["uMTTDZ2mb4", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["engine, run, people", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], 
"captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sOa7g-44Dag", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["background, man, spray", "rooster, crow, background, men"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a woman and man are speaking", "an animal quacks rapidly"], "sample_ids": ["vbpKkWvfOu4", "vh30P49Po6s"], "start_seconds": ["560", "30"], "properties": ["two people, speaking, woman, man", "animal, quacks, rapidly"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a duck is quacking loudly"], "question": "which entity is speaking", "label": 0}, {"captions": ["birds chirps while a siren signals in the distance", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uKCSGgof8gI", "su6FAOcOA8c"], "start_seconds": ["12", "4"], "properties": ["chirps, distance, signal", "engine, idle, woman"], "captions_pred_video": ["footage of a street in a small town 
on a sunny day", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a motor noise is accompanied by a door opening and closing"], "sample_ids": ["w5W5Kqtc8E", "vBHyYJ8pL0"], "start_seconds": ["100", "2"], "properties": ["wind, blow, vehicle", "noise, door, opening"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is accompanied by a door opening and closing?", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "a child speaks in closed space"], "sample_ids": ["w2JXXIAdUdg", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["emits, sleeping, person", "child, space, speak"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "water splashing and a person laughs in the distance then a man speaks nearby"], "sample_ids": ["zliInBdC98Y", "vddP56-ogds"], "start_seconds": ["30", "30"], "properties": ["a, baby, cries, wails", "water, splash, person, laugh"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", null], "captions_pred_audio": ["a baby cries and a woman speaks", "water is running and gurgling and a man is speaking"], "question": "which entity is more active", "label": 1}, {"captions": ["multiple beeps 
are followed by a squawk and a child speaking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w34HjHr6gAY", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "stream, water, flow"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage is blurry and out of focus"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "an infant crying as a woman laughs"], "sample_ids": ["vveS8HT7Uog", "xhmRY9yhC7c"], "start_seconds": ["100", "20"], "properties": ["a man, objects, speak", "a, laugh, infant"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["w34HjHr6gAY", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["beeps, squawk, child speaking", "loud, laughter, intermittent"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard 
of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an electronic device bleeps once", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tHJ6JSa8Y4", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["bleeps, electronic, device", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a clock is ticking and beeping", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "wind blowing and birds chirping with the distant cooing of a large bird"], "sample_ids": ["uZesmtKZGSw", "wRBHTgrbiwg"], "start_seconds": ["250", "50"], "properties": ["men, talk, cars", "birds, chirp, cooing"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", 
"of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "birds are chirping and insects are buzzing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child speaks", "after a few seconds of silence, a loud bang occurs followed by a softer banging noise"], "sample_ids": ["yW6FWLSLkx4", "zkKdxzNC97Y"], "start_seconds": ["40", "27"], "properties": ["a, child, speaks", "loud, bang, noise"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a door is opened and closed"], "question": "which entity is silent", "label": 1}, {"captions": ["some men converse over an engine running", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["sCiy7QS1U", "ziUT9IFTkjg"], "start_seconds": ["300", "10"], "properties": ["men, converse, engine", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "vehicles pass by on a roadway"], "sample_ids": ["xM4joTqDVp4", "tgbONvsP47Y"], "start_seconds": ["160", "0"], "properties": ["background, chirp, birds", "pass, vehicle, roadway"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a car is driving on the road "], "question": "which entity is a video of a train chugging?", "label": 0}, {"captions": ["a man speaks over 
intermittent keyboard taps", "a clicking followed by some people laughing and a kid speaking"], "sample_ids": ["tw76HGONaKg", "vz8868znkVQ"], "start_seconds": ["570", "60"], "properties": ["audio, man, keyboard", "audio, click, kid speaking"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a video of a plane flying over a cloudy sky"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a baby is laughing and breathing with background noise "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 0}, {"captions": ["water flows followed by women screaming", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["w5W5Kqtc8E", "tiDFTC-5vU"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and ducks are quacking"], "question": "which entity is a group of people", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "someone snores nearby"], "sample_ids": ["siJFXfGWgDk", "spJCm8tD9Zo"], 
"start_seconds": ["50", "90"], "properties": ["a, bird, vehicle", "someone snores, nearby, someone"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vzxHnu-SFEw", "sSMl2vc3ek"], "start_seconds": ["80", "20"], "properties": ["two objects, woman, speak", "loud, multiple, distance"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an adult woman and an adult man speak", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["zTLVJCo4WEE", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["two people, 
adult, speak", "beeps, hit, woman"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "wind blows strongly"], "sample_ids": ["sncRqQ67iJU", "w8uLijTqtlU"], "start_seconds": ["460", "70"], "properties": ["loud, repeatedly, man", "wind, blows, strongly"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage is blurry and shaky"], "captions_pred_audio": ["a person is snoring", "the wind is blowing strongly"], "question": "which entity is not a person", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "several insects fly while two men talk"], "sample_ids": ["zofjfKhqLk8", "s-T9OVOiMLo"], "start_seconds": ["10", "330"], "properties": ["background, metal, clings", "several, fly, men"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "bees buzz and wind blows"], "sample_ids": ["sWZzXuWYY", 
"tMJne1a4AFI"], "start_seconds": ["420", "0"], "properties": ["male, clanks, thumps", "bees buzz, wind blows, bees"], "captions_pred_video": [null, "a swarm of bees on the ground"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a swarm of bees buzzing around"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y2ZBGpgbhHM", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["dog, chirp, breathe", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a crow?", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vs65y4qmyBE", "tDVADusiIoc"], "start_seconds": ["340", "60"], "properties": ["wind, blows, strongly", "water, radio, man"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "several insects fly while two men talk"], "sample_ids": ["sxYkFKFIZD0", "s-T9OVOiMLo"], "start_seconds": ["20", "330"], "properties": ["screech, man, door", "several, fly, men"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 
audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a beep occurs briefly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["xtWeJ56-U-g", "sLUnaPT5gM8"], "start_seconds": ["20", "0"], "properties": ["beep, occur, briefly", "loud, laughter, intermittent"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream of water runs briefly", "people applaud and hoot and chat quietly"], "sample_ids": ["x-PeY8Yb8M4", "wwyfGO2J4"], "start_seconds": ["300", "90"], "properties": ["stream, water, run", "people, applaud, hoot"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["ukg5L09Wpvo", "tdWhHV3X25Q"], "start_seconds": ["150", "60"], 
"properties": ["a train, a horn, a bell", "applause, audience, yells"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a person snores loudly multiple times at a close distance"], "sample_ids": ["ylpYOorfH4o", "sSMl2vc3ek"], "start_seconds": ["410", "20"], "properties": ["engine, running, wind", "loud, multiple, distance"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person snoring several times", "plastic is tapped on while someone speaks"], "sample_ids": ["spJCm8tD9Zo", "wvKpEYswXO0"], "start_seconds": ["90", "150"], "properties": ["snore, person, several", "plastic, tap, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and tapping with background noise and water running "], "question": 
"which entity is tapped on while someone speaks", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a stream of water flows as people talk and wind blows"], "sample_ids": ["x5cuQjOdM3E", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["cat, talk, meow", "stream, water, flow"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage is blurry and out of focus"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wz7N8YRy74I", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["rooster, crow, background, men", "a woman, laughs, animal"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a woman is speaking and a baby is crying"], "question": "which entity has a rooster?", "label": 0}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "people applaud and hoot and chat quietly"], "sample_ids": ["wRBHTgrbiwg", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["birds, chirp, cooing", "people, applaud, hoot"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person speaks briefly", "a clock ticktocks"], "sample_ids": ["zOZleIRqZm4", "v-g-j2uTByM"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "ticktocks, clock, 
ticktocks"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "people speak as gunfire rings out"], "sample_ids": ["tdWhHV3X25Q", "wqTCwqVRDlk"], "start_seconds": ["60", "80"], "properties": ["applause, audience, yells", "gunfire, ring, speak"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["xSKJGCItUWE", "tDlfY3nmx1A"], "start_seconds": ["10", "160"], "properties": ["engine, work, child", "applause, laugh, man"], "captions_pred_video": ["footage of the helicopter flying in the room", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity is a performance", "label": 1}, {"captions": ["a dog barks and whimpers", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sShpyu2l4YQ", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "rooster, crow, background, men"], "captions_pred_video": ["the puppies are playing with a toy", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a dog is barking and growling", "a man is 
speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a man speaks as a car is passing by"], "sample_ids": ["sAam2NqGhLY", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "a, car, pass"], "captions_pred_video": ["of a little girl sleeping on a couch", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person is snoring", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "paper folding and crinkling"], "sample_ids": ["tEE3MpBt1sg", "zPpG3RD8lSs"], "start_seconds": ["50", "20"], "properties": ["drill, something, laugh", "paper, fold, crinkle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube 
how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "the wind blows and a mouse clicks "], "question": "which is not a drill", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["slZLHwNbbt4", "zj2R0XoFr5k"], "start_seconds": ["300", "50"], "properties": ["train, horn, sound", "airplane, boy, fly"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "small dogs yip and bark sharply"], "sample_ids": ["s59PfAghdkM", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["bird, chirp, background, horse, neigh", "bark, yip, sharply"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["sfAvvZwdLCY", "tw76HGONaKg"], "start_seconds": ["20", "570"], "properties": ["water drains, flushes, water", "A, game, keyboard"], "captions_pred_video": ["footage of the toilet in the bathroom", "game you 
are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a toilet is flushed", "a man speaks and types on a computer keyboard "], "question": "which object is a source of water", "label": 0}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "water flows as men speak and yell"], "sample_ids": ["w9lpbUn0hPc", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["male, wind, rustling", "water, flow, men"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more like a stream", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "water flows and trickles"], "sample_ids": ["uJV8NDaHqqk", "tB7hWb9gTuQ"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "water, flow, trickle"], "captions_pred_video": ["a bee hive in a wooden box", "the rocks on the beach are surrounded by water and the sky is visible in 
the background"], "captions_pred_audio": ["a swarm of bees buzzing around", "water is splashing and gurgling"], "question": "which entity is not loud", "label": 1}, {"captions": ["a small engine idles continuously", "a man speaks as a motor runs in the background"], "sample_ids": ["y5WII6cTH7k", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["engine, idle, continuously", "background, motor, run"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vh30P49Po6s", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["loud, continuous, quacks", "three men, wind, flow"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a duck?", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "birds chirp and objects are moved around"], "sample_ids": ["sYITalLZjj4", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["stream, flow, wind", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["wind blows and birds chirp", "insects buzz and a man speaks"], "question": "which entity is moving objects around", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a stream of water 
flows as people talk and wind blows"], "sample_ids": ["w2M4i1mklOA", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "stream, water, flow"], "captions_pred_video": ["footage of an antique clock", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a woman speaks as she rubs two objects together"], "sample_ids": ["zl9Dqx-j7q4", "vzxHnu-SFEw"], "start_seconds": ["6", "80"], "properties": ["motors rev, laugh, loudly", "two objects, woman, speak"], "captions_pred_video": ["footage of a man driving a car in the dark", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a jet engine roars ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "an infant crying frantically"], "sample_ids": ["uWPRNLnpy7Y", "zwOBqeFTgiU"], 
"start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "cry, infant, frantically"], "captions_pred_video": ["is taken from a car driving down the street", "of the baby crying in the car seat"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "someone snores nearby"], "sample_ids": ["xzKKf9bKNUo", "spJCm8tD9Zo"], "start_seconds": ["10", "90"], "properties": ["background, noise, snoring", "someone snores, nearby, someone"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a person snoring loudly", "a person is snoring loudly"], "question": "which entity is a recording of snoring?", "label": 0}, {"captions": ["a man speaks as music plays before artillery is fired", "a propeller rotates loudly and intensely"], "sample_ids": ["vcmWSmvti8", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["music, man, fire", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["x5cuQjOdM3E", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["cat, meows, young woman", "engine, laugh, loud"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a cat meows and a woman speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a telephone rings followed by a woman 
talking", "a child speaks in closed space"], "sample_ids": ["tGcFnX0GHI", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["ring, talk, woman", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child speaking in a closed space?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "paper is crumpling consistently"], "sample_ids": ["sSMl2vc3ek", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["loud, multiple, distance", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person snoring loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "wind blows as people chatter quietly"], "sample_ids": ["spYNpeN7rPY", "xBxDz0CFVn0"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "wind, chatter, people"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yaln9y8I7ms", "vb1fPSDI4c"], "start_seconds": ["230", "30"], "properties": ["female, flushes, toilet", "multiple, people, yell"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a door opens and birds chirp", "a child yells and another yells"], "sample_ids": ["yeFvk9x0wWI", "vMDHu7Lxcgw"], "start_seconds": ["30", "410"], "properties": ["door, open, birds", "two, yell, child"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a boy playing 
on a trampoline in the backyard"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking and a child is shouting"], "question": "which entity is more likely to be a scream", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "an infant crying as a woman laughs"], "sample_ids": ["yZp6xizR0yU", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["animal, bleat, cry", "a, laugh, infant"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["y8WEcpOlT3I", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["wind, speak, buffeting", "water, radio, man"], "captions_pred_video": ["on how to use a sewing machine youtube", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "an infant crying as a woman laughs"], "sample_ids": ["xKB8O8LTs6s", "xhmRY9yhC7c"], "start_seconds": ["70", "20"], "properties": ["music, radio, gunshots", "a, laugh, infant"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a baby cries and a woman speaks"], "question": "which entity is about a woman?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set 
down", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["x6ijhqRY38s", "wRBHTgrbiwg"], "start_seconds": ["250", "50"], "properties": ["something metal, glass, hit", "bird, owl, speak"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "birds are chirping and insects are buzzing"], "question": "which entity has more animals", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["ylpYOorfH4o", "xBxDz0CFVn0"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "stream, water, flow"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "people applaud and hoot and chat quietly"], "sample_ids": ["zl9Dqx-j7q4", "wwyfGO2J4"], "start_seconds": ["6", "90"], "properties": ["motors rev, laugh, loudly", "people, applaud, hoot"], "captions_pred_video": ["footage of a man driving a car in the dark", 
null], "captions_pred_audio": ["a jet engine roars ", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "wind blows as people chatter quietly"], "sample_ids": ["vMf1dLD6Sng", "xBxDz0CFVn0"], "start_seconds": ["6", "30"], "properties": ["frog, bird, vocalize", "wind, chatter, people"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["a frog croaks loudly", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "winds blows roughly as a vehicle races past"], "sample_ids": ["wudZTNBtVqc", "xjvTpk2Zpr8"], "start_seconds": ["60", "70"], "properties": ["accelerates, engine, wind", "wind, blows, vehicle"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a rumble grows louder", "several insects fly while two men talk"], "sample_ids": ["y4MY9mp8-TA", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["loudness, increase, rumble", "several, fly, men"], "captions_pred_video": ["a helicopter flying in the sky", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a helicopter flies overhead ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a forest", "label": 1}, {"captions": ["someone is snoring while sleeping", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["ujMt0-D-x2k", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["snore, sleep, someone", 
"two men, woman, birds"], "captions_pred_video": ["of the dog playing with a toy on the floor", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a baby laugh at a sputter", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["sLUnaPT5gM8", "wyllXV6PjKo"], "start_seconds": ["0", "30"], "properties": ["laugh, sputter, baby", "a baby, a woman, a man"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman speaks and a baby cries"], "question": "which baby is crying", "label": 1}, {"captions": ["a goat bleats as a person speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["tPJvjq9QePY", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["bleats, person, speak", "people, applaud, hoot"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["long loud burping by a man", "some men converse over an engine running"], "sample_ids": ["xmiUIOhtZyQ", "sCiy7QS1U"], "start_seconds": ["60", "300"], "properties": ["loud, burp, man", "men, converse, engine"], "captions_pred_video": ["homer simpson drinking a beer", null], "captions_pred_audio": ["a person burps and music plays in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man?", "label": 0}, {"captions": ["an engine runs and a man speaks", "a child speaks in closed space"], "sample_ids": ["yT5WfYMRr-U", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["engine, run, man", "child, space, speak"], "captions_pred_video": 
["a man in a red jacket and sunglasses driving a boat", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sZPuqDgX2V0", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["commentator, race, track", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video of a race", "label": 0}, {"captions": ["a woman speaks followed by clicks and scraping", "a woman speaks as she rubs two objects together"], "sample_ids": ["yYJksgsxx5U", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["audio, clicks, scraping", "two objects, woman, speak"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "paper is crumpling consistently"], "sample_ids": ["yaln9y8I7ms", "v5cSxLaHADY"], "start_seconds": ["230", "0"], "properties": ["female, flushes, toilet", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and out of focus", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a toilet flushes and a man speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a person is burping then speaks and laughs"], "sample_ids": ["w2M4i1mklOA", "wAAkbZToh8"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "burp, laugh, speak"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man burps and a woman speaks"], "question": "which entity is speaking", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["vs65y4qmyBE", "wwyfGO2J4"], "start_seconds": ["340", "90"], "properties": ["wind, blows, strongly", "people, applaud, hoot"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a natural phenomenon", "label": 0}, {"captions": ["a door slams shut roughly", "water splashes and a door squeaks"], "sample_ids": 
["zkKdxzNC97Y", "sdXV-ylviw"], "start_seconds": ["27", "190"], "properties": ["a door, slams, shut", "sound, splash, door"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a dog barks and taps with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a woman sneezes then speaks"], "sample_ids": ["tDVADusiIoc", "x4dZyf9Gbj0"], "start_seconds": ["60", "130"], "properties": ["water, radio, man", "sneezes, speaks, woman"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman sneezes and speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["uJV8NDaHqqk", "ziUT9IFTkjg"], "start_seconds": ["100", "10"], "properties": ["loud, fly, chirp", "background, birds, rustling"], "captions_pred_video": ["a bee hive in a wooden box", null], "captions_pred_audio": ["a swarm of bees buzzing around", "birds are chirping and a chime is ringing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zF8yoL0rkbI", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["engine, run, someone", "two men, woman, birds"], "captions_pred_video": ["footage of the traffic on the street at night", null], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": 
["vehicle tires screech and a man speaks before a car door opens", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sxYkFKFIZD0", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["screech, man, door", "men, talk, cars"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a car speeding up in the distance", "vehicles pass by on a roadway"], "sample_ids": ["u0TrcHhkPQ", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["distance, car, speed", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving on the road "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["children speak as a female ask them questions", "a clock ticktocks"], 
"sample_ids": ["wEBlkGWVWwE", "v-g-j2uTByM"], "start_seconds": ["260", "30"], "properties": ["female, speak, questions", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows a person writing on the whiteboard", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a drill drills through something then people begin laughing"], "sample_ids": ["t25U-v4k4ts", "tEE3MpBt1sg"], "start_seconds": ["40", "50"], "properties": ["a, chirps, bird", "drill, something, laugh"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "people are laughing breathing and speaking with background noise "], "question": "which entity is about a drill?", "label": 1}, {"captions": ["someone whistles a tune", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sIXTftIuUgw", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["someone, tune, whistle", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "birds tweet and squawk"], "sample_ids": ["sWZzXuWYY", "w1mlz3Pe4fU"], "start_seconds": ["420", "300"], "properties": ["male, clanks, thumps", "squawk, tweet, scream"], "captions_pred_video": [null, "of a bird in a cage"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "birds are chirping and singing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a child speaks in 
closed space", "people speak softly as food sizzles"], "sample_ids": ["yW6FWLSLkx4", "yhQ2Lg-7qDY"], "start_seconds": ["40", "130"], "properties": ["child, space, speak", "food, sizzle, speak"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a faucet is running and a man is speaking"], "question": "which entity is more likely to be in a closed space", "label": 0}, {"captions": ["a person sniffs and sneezes", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uRlbY6aoBU", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["sneezes, person, sniffs", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a person", "label": 0}, {"captions": ["a woman speaks and food sizzles while frying", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wTideSjRFS0", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["food, sizzle, woman", "People, motor, brakes"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a woman speaking and food sizzling while frying?", "label": 0}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "people cheer as a vehicle engine revs"], "sample_ids": ["uC9dtII1KDI", "xjhAnI2q6hM"], "start_seconds": ["150", "6"], "properties": ["wind, gusts, distance", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a person 
riding a horse in a riding arena", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a stream of water flows quickly", "people speak as gunfire rings out"], "sample_ids": ["wbHTKEJZyhc", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["stream, water, flow", "gunfire, ring, speak"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["a bird chirps in response to a 
woman chirping for the birds", "wind blowing followed by a zoom"], "sample_ids": ["uOpoD0gGXcs", "vr8ZXjEBhMQ"], "start_seconds": ["120", "150"], "properties": ["chirps, woman, bird", "wind, blow, zoom"], "captions_pred_video": ["a herd of cows grazing in the field", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["birds are chirping and a man is speaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to blow", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a man speaks as a car is passing by"], "sample_ids": ["vXlk0lIQBFo", "sK4u5T8hW78"], "start_seconds": ["470", "30"], "properties": ["wind, talk, vocalize", "a, car, pass"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "children speak and play together"], "sample_ids": ["xjvTpk2Zpr8", "yVVP8XvWJTo"], "start_seconds": ["70", "260"], "properties": ["wind, blows, vehicle", "children, speak, play"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a jet engine roars and wind blows ", "children are speaking and breathing with background noise "], "question": "which entity is more likely to be 
in a vehicle", "label": 0}, {"captions": ["an emergency siren wails as it passes", "wind blows as people chatter quietly"], "sample_ids": ["vGj1XLJvNrw", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["wails, wails, pass", "wind, chatter, people"], "captions_pred_video": ["footage of a police car driving down a city street", "footage is blurry and out of focus"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["v0x1odnXtP0", "yajyRTUQk3U"], "start_seconds": ["210", "400"], "properties": ["keyboard, type, computer", "a woman, something, fried"], "captions_pred_video": ["how to make money on youtube in spanish", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking while food is frying in the background"], "question": "which entity is a demonstration of cooking?", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["u--KhUW8l1Y", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["engine, sound, horn", "water, radio, man"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a man speaks as a machine runs"], "sample_ids": ["uoGVs9yUqY4", "vD6lYD1l0BY"], "start_seconds": ["30", "330"], "properties": ["multiple, vocalize, wind", "a, machine, run"], 
"captions_pred_video": ["for how to make a wooden shed door youtube", "game controller being held in the hands of the person"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a man is speaking and dishes are being washed "], "question": "which entity is a man speaking to a machine?", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vcmWSmvti8", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["music, man, fire", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a consistent ticking pattern"], "sample_ids": ["vimzuGQvdcU", "sCeWURVHfOM"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "ticking, pattern, clock"], "captions_pred_video": ["a group of people are rafting down a river", "- a close-up view of the clock's inner workings"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "ticking of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zliInBdC98Y", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["a, baby, cries, wails", "water, radio, man"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a 
man speaking over a radio?", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["y2ZBGpgbhHM", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["animal, growl, bird", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a baby cries and a woman moans", "dishes cling together then a man begins to speak"], "sample_ids": ["smDKStoHBJo", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["a, cry, woman", "cling, speak, dishes"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "mechanisms are operating and water is splashing "], "question": "which entity is about a baby crying and a woman moaning?", "label": 0}, {"captions": ["an animal quacks rapidly", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["vh30P49Po6s", "siJFXfGWgDk"], "start_seconds": ["30", "50"], "properties": ["animal, quacks, rapidly", "man, woman, vehicle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking and birds are chirping in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "several insects fly while two men talk"], "sample_ids": ["tezvROoo4bs", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["audio, throttle, speaking", "several, fly, men"], "captions_pred_video": ["footage of a busy city 
street with cars parked on both sides of the road", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a duck quacks several times"], "sample_ids": ["yeFvk9x0wWI", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["clack, bird, chirp", "quacks, duck, several"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 0}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uzQnlJXBbOM", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["ringing, beep, stop", "female, spraying, scream"], "captions_pred_video": ["footage of a person using a cell phone on a table", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a telephone rings and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a clock ticks quietly and rhythmically", "winds blows roughly as a vehicle races past"], "sample_ids": ["u7C-AEBQM", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["ticks, rhythmic, quiet", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a ticktock of a clock", "a jet engine roars and wind blows "], "question": "which entity is louder", "label": 1}, {"captions": ["music plays and repeated slaps accompany 
human sniveling, then insect buzz", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["weDbePuc-Xc", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["music, slaps, human", "loud, jet engine, roar"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a horn rings out as a machine runs by"], "sample_ids": ["u5RmF3c3Aw", "slZLHwNbbt4"], "start_seconds": ["60", "300"], "properties": ["engine, car, zoom", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vzceMbklWc", "tDVADusiIoc"], "start_seconds": ["180", "60"], "properties": ["water, faucet, sink", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "water flows as men speak and yell"], "sample_ids": ["tGcFnX0GHI", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["ring, talk, woman", "water, flow, men"], "captions_pred_video": 
[null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "vehicles pass by on a roadway"], "sample_ids": ["tPJvjq9QePY", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["animal, bleat, moo", "pass, vehicle, roadway"], "captions_pred_video": ["a dog and a sheep in a barn", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a baby cries and a man speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a man speaks, another man speaks, and a small bell dings"], "sample_ids": ["uEU-Hg5MTN8", "t69a8aRKhmc"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "a, b, c"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and birds are chirping in the background "], "question": "which entity has a ding?", "label": 1}, {"captions": ["a person speaks briefly", "vehicles pass by on a roadway"], "sample_ids": ["zOZleIRqZm4", "tgbONvsP47Y"], "start_seconds": ["80", "0"], "properties": ["person, talk, brief", "pass, vehicle, roadway"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a car is driving on the road "], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["xC8kbrKJmco", 
"uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["background, goat, scream", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "three men talk while wind blows and some liquid flows"], "sample_ids": ["slZLHwNbbt4", "vJ7JPEFhyLA"], "start_seconds": ["300", "16"], "properties": ["a, horn, run", "three men, wind, flow"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a machine running by?", "label": 0}, {"captions": ["food fries in a pan as someone talks and cooks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["ukxt9I7eMMg", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["food, pan, cook", "two men, woman, birds"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "a telephone rings followed by a woman talking"], "sample_ids": ["rqu8iB22IY", "tGcFnX0GHI"], "start_seconds": ["5", "0"], "properties": ["sound, repeats, laugh", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": 
["a vehicle accelerates and squeals tires", "a person is burping while a girl speaks"], "sample_ids": ["yRx9txMcBl0", "vdoxuJn9lTc"], "start_seconds": ["40", "40"], "properties": ["accelerates, tires, squeals", "person, burp, girl"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a child speaks followed by a burp"], "question": "which entity is a person", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "roadway noise occurs and a truck accelerates"], "sample_ids": ["vJvryTwuAV8", "tgbONvsP47Y"], "start_seconds": ["16", "0"], "properties": ["audience, cheer, man", "noise, truck, accelerate"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a car is driving on the road "], "question": "which is not a person", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xO-Q2BlIIPU", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["two men, exclamation, speak", "a woman, laughs, animal"], "captions_pred_video": ["a clock with a green glowing display 
showing the time 09 07 2016 12 31 2016", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and laughing?", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a man speaks while water trickles and flows"], "sample_ids": ["ugHJF0hfYkg", "sapQIQUhFc"], "start_seconds": ["10", "280"], "properties": ["engine, idle, continuously", "water, trickles, flow"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "roadway noise occurs and a truck accelerates"], "sample_ids": ["ukg5L09Wpvo", "tgbONvsP47Y"], "start_seconds": ["150", "0"], "properties": ["a train, a horn, a bell", "noise, truck, accelerate"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a car is driving on the road "], "question": "which is not a vehicle", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a man speaks followed by another man speaking outside"], "sample_ids": ["yZmhM1HcsyE", "viuTg1M-dqg"], "start_seconds": ["4", "30"], "properties": ["engine, roar, water", "two men, speak, follow"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which 
entity is a video of two men speaking?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a toilet flushes and a female speaks"], "sample_ids": ["sQwlkXjQabo", "yaln9y8I7ms"], "start_seconds": ["10", "230"], "properties": ["liquid, surface, spray", "female, flushes, toilet"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage is blurry and out of focus"], "captions_pred_audio": ["spraying followed by silence", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wvKpEYswXO0", "zj2R0XoFr5k"], "start_seconds": ["150", "50"], "properties": ["sound, water, running", "airplane, boy, fly"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tgbONvsP47Y", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["pass, vehicle, roadway", "a woman, something, fried"], "captions_pred_video": ["footage of a fire truck entering a garage", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car is driving on the road ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["people speak softly as food sizzles", "a car accelerates and wind blows"], "sample_ids": ["yhQ2Lg-7qDY", "u0TrcHhkPQ"], "start_seconds": 
["130", "20"], "properties": ["food, sizzle, speak", "accelerates, wind, blows"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp as a train approaches", "paper is crumpling consistently"], "sample_ids": ["xM4joTqDVp4", "v5cSxLaHADY"], "start_seconds": ["160", "0"], "properties": ["bird, chirp, train", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping and a train is moving ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["material crumbles into a microphone", "a infant makes noise and is excited"], "sample_ids": ["vofpvUo6NAw", "wIJK3-5y0kA"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, microphone", "noise, excited, infant"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "someone is typing on a computer keyboard"], "sample_ids": ["uWPRNLnpy7Y", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["accelerate, laugh, vehicle", "keyboard, type, computer"], "captions_pred_video": ["is taken from a car driving down the street", "how to make money on youtube in spanish"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which is a person", "label": 1}, 
{"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "an insect buzzes around continuously"], "sample_ids": ["s4Uz1Ffgo04", "v25l1jef3JY"], "start_seconds": ["100", "0"], "properties": ["water, rushes, motorcycle", "buzzes, continuously, insect"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a fly is buzzing around a microphone "], "question": "which entity is moving faster", "label": 0}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "an insect buzzes around continuously"], "sample_ids": ["tDVADusiIoc", "v25l1jef3JY"], "start_seconds": ["60", "0"], "properties": ["water, radio, man", "buzzes, continuously, insect"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while water drains", "a train horn blows as it passes by"], "sample_ids": ["vSeGhaZt-aI", "zVacuqSb4LI"], "start_seconds": ["50", "30"], "properties": ["water, drain, man", "horn, blows, train"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong 
a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is moving", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a child speaks in closed space"], "sample_ids": ["sZPuqDgX2V0", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["engine, accelerate, intercom", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a horn blasts loudly as a train passes"], "sample_ids": ["wEBlkGWVWwE", "zsLxS-uLJTw"], "start_seconds": ["260", "20"], "properties": ["a, babble, woman", "horn, blast, train"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of the train on the tracks at sunrise or sunset"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a train blows its horn and moves on the tracks "], "question": "which entity is louder", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "some men converse over an engine running"], "sample_ids": ["xKB8O8LTs6s", "sCiy7QS1U"], "start_seconds": ["70", "300"], "properties": ["music, gunfire, explosion", "men, converse, engine"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", 
null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more calm", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a horn rings out as a machine runs by"], "sample_ids": ["zF8yoL0rkbI", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["engine, run, someone", "a, horn, run"], "captions_pred_video": ["footage of the traffic on the street at night", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a man speaks as a car is passing by"], "sample_ids": ["yajyRTUQk3U", "sK4u5T8hW78"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "a, car, pass"], "captions_pred_video": ["- a woman cooking in the kitchen", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["goats bleat and people speak", "three men talk while wind blows and some liquid flows"], "sample_ids": ["z5iUE5h0EPs", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["goats bleat, people speak, language", "three men, wind, flow"], "captions_pred_video": ["of the goat in the barn", 
"a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a goat bleats and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "wind blows as people chatter quietly"], "sample_ids": ["sEprKHm8Sj8", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["car, tires, slows", "wind, chatter, people"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage is blurry and out of focus"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zF8yoL0rkbI", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["engine, run, someone", "engine, idle, woman"], "captions_pred_video": ["footage of the traffic on the street at night", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a woman is speaking and a subway train is moving "], "question": "which entity has a running engine", "label": 0}, {"captions": ["small dogs growl, bark and yip.", "a woman speaks as she rubs two objects together"], "sample_ids": ["sShpyu2l4YQ", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["growl, bark, yip", "two objects, woman, speak"], "captions_pred_video": ["the puppies are playing with a toy", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zliInBdC98Y", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["a, baby, cries, wails", "engine, laugh, loud"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a baby cries and a woman speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["two frogs croak at each other", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zg0X6BnhOLQ", "xfaoyyzw2WU"], "start_seconds": ["410", "180"], "properties": ["two frogs, croak, at each other", "loud, jet engine, roar"], "captions_pred_video": ["footage of lightning in the sky at night", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a frog is croaking", "an aircraft engine roars and a man speaks "], "question": "which 
is louder", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a man speaks as a car is passing by"], "sample_ids": ["y8dSeubCNI", "sK4u5T8hW78"], "start_seconds": ["4", "30"], "properties": ["men, women, car", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking with background noise and breathing sounds "], "question": "which car is revving and accelerating loudly", "label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["y2ZBGpgbhHM", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["dog, chirp, breathe", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["a child yells and another yells", "a train horn blows as it passes by"], "sample_ids": ["vMDHu7Lxcgw", "zVacuqSb4LI"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "horn, blows, train"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["w6RTHR6AeAg", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["call, owl, screech", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a train horn blows as it passes by"], "sample_ids": ["ugHJF0hfYkg", "zVacuqSb4LI"], "start_seconds": ["10", "30"], "properties": ["engine, idle, continuously", "horn, blows, train"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video 
by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a helicopter is flying overhead ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["zY3icUyMdh8", "w6RTHR6AeAg"], "start_seconds": ["20", "40"], "properties": ["dog, bark, engine", "call, owl, screech"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "an owl hoots and mechanisms operate "], "question": "which entity is a bird", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["yNtRmrn0io8", "uqFtmnhuqA8"], "start_seconds": ["210", "30"], "properties": ["storm, distance, strike", "a, b, c"], "captions_pred_video": ["footage of a house in the middle of the night", "shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["rain falls and thunder roars", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["t25U-v4k4ts", "wDVMhEdTiVw"], "start_seconds": ["40", "30"], "properties": ["a, chirps, bird", "gun, shoot, water"], "captions_pred_video": ["of a beekeeper working on a 
beehive in the woods", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause harm", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "small dogs yip and bark sharply"], "sample_ids": ["tMJne1a4AFI", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["wind, buzz, rustling", "bark, yip, sharply"], "captions_pred_video": ["a swarm of bees on the ground", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a swarm of bees buzzing around", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["wP8ZKrlx3oA", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["fall, rain, splash", "animal, grunts, snorts"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a baby is crying"], "question": "which entity is not a splash", "label": 1}, {"captions": ["water flows as men speak and yell", "a child speaks in closed space"], "sample_ids": ["vJ7JPEFhyLA", "yW6FWLSLkx4"], "start_seconds": ["16", "40"], "properties": ["water, flow, men", "child, space, speak"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": 
["continuous sneezing together with speech", "waves crash against a shoreline and people speak"], "sample_ids": ["x4dZyf9Gbj0", "yFB25fqfU8I"], "start_seconds": ["130", "300"], "properties": ["continuous, sneeze, speech", "wave, crash, shoreline"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "an engine runs loudly"], "sample_ids": ["ul60S8TXDA8", "vqZuVbG6-HI"], "start_seconds": ["60", "130"], "properties": ["sound, distance, bell", "loud, engine, run"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["s59PfAghdkM", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["bird, chirp, background, horse, neigh", "female, spraying, scream"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": 
["wP8ZKrlx3oA", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["rain, storm, thunder", "water, radio, man"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "some tunes played by whistling"], "sample_ids": ["uqFtmnhuqA8", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["a, b, c", "tune, play, whistling"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a person whistling a song"], "question": "which entity is a musical composition", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vddP56-ogds", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a church bell rings several times", "a train horn blows as it passes by"], "sample_ids": ["sUVVjE3Ucp8", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["ring, bell, several", "horn, blows, train"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a church bell is ringing ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["people clap and speak in the distance", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wwyfGO2J4", "xfaoyyzw2WU"], "start_seconds": ["90", "180"], "properties": ["clap, distance, speak", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a person snoring", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["t8tv5YRMJUg", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["a person, snore, loud", "wind, blow, vehicle"], "captions_pred_video": ["of a man getting his face licked by another man", null], "captions_pred_audio": ["a person sniffs and breathes heavily", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a person?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a stream of water runs briefly"], "sample_ids": 
["uzQnlJXBbOM", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["ringing, beep, stop", "stream, water, run"], "captions_pred_video": ["footage of a person using a cell phone on a table", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a telephone rings and a man speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["sfAvvZwdLCY", "xyL9F5VrjkE"], "start_seconds": ["20", "20"], "properties": ["water drains, flushes, water", "wind, motor, distance"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["a toilet is flushed", "the wind is blowing and a car is passing by "], "question": "which entity is a source of water", "label": 0}, {"captions": ["waves crash against a shoreline and wind blows", "a woman speaks as she rubs two objects together"], "sample_ids": ["zdYdyF9-m8U", "vzxHnu-SFEw"], "start_seconds": ["7", "80"], "properties": ["wind, crash, shoreline", "two objects, woman, speak"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make 
a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["waves crash and wind blows ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a physical action", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vzxHnu-SFEw", "zFjIWfSD-4"], "start_seconds": ["80", "410"], "properties": ["two objects, woman, speak", "People, motor, brakes"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a clock ticktocks"], "sample_ids": ["uEU-Hg5MTN8", "v-g-j2uTByM"], "start_seconds": ["27", "30"], 
"properties": ["animal, grunts, snorts", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a stream of water flows quickly", "people applaud and hoot and chat quietly"], "sample_ids": ["wbHTKEJZyhc", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["stream, water, flow", "people, applaud, hoot"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a 
clock ticktocks"], "sample_ids": ["wTjoRj1se3U", "v-g-j2uTByM"], "start_seconds": ["390", "30"], "properties": ["airplane, engine, spool", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a jet engine is running and people are talking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["water splashes as an animal walks through", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["w1ir-sZ3Im8", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["animal, water, splashes", "female, spraying, scream"], "captions_pred_video": ["footage of a group of people riding horses through a river", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying and sizzles", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zNRChLjqcU", "vfYTJq7nU"], "start_seconds": ["220", "130"], "properties": ["food is frying, sizzles, food", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "a duck quacks and a woman speaks"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["material crumbles into a microphone", "a man speaks followed by another man speaking outside"], "sample_ids": ["vofpvUo6NAw", "viuTg1M-dqg"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, microphone", "two men, speak, follow"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of water coming out of a hole in the ground"], 
"captions_pred_audio": ["paper is being crumpled and crinkled", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a toilet flushes and a female speaks"], "sample_ids": ["zcDwZ6W7E3E", "yaln9y8I7ms"], "start_seconds": ["180", "230"], "properties": ["man, speak, motorcycles", "female, flushes, toilet"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "an infant crying frantically"], "sample_ids": ["xERFUeZONz8", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "cry, infant, frantically"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "of the baby crying in the car seat"], "captions_pred_audio": ["an emergency vehicle siren blares", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a stream of water flows as people talk and wind blows"], "sample_ids": ["x9JovgqUcs", "xBxDz0CFVn0"], "start_seconds": ["500", "30"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "paper is crumpling consistently"], "sample_ids": ["vs65y4qmyBE", 
"v5cSxLaHADY"], "start_seconds": ["340", "0"], "properties": ["engine, run, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a machine beeps continuously"], "sample_ids": ["wtDqrBygTcU", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["man, engine, run", "beeps, machine, continuously"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", null], "captions_pred_audio": ["a man is speaking and a motor is running", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["water pouring and bubbling", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uyRfq-jKPpo", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["water, bubbles, pouring", "female, spraying, scream"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "footage of a person spraying paint on the 
ceiling"], "captions_pred_audio": ["water is running from a faucet", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and spraying?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yPUYU6t3rwo", "vb1fPSDI4c"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "multiple, people, yell"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", null], "captions_pred_audio": ["insects buzz and a man speaks", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["continuous chugging with birds chirping in the background", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xM4joTqDVp4", "vbZ-0lGPneg"], "start_seconds": ["160", "30"], "properties": ["background, chirp, birds", "a woman, a television program, a bird"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a woman is speaking and a dog is whimpering"], "question": "which entity has birds chirping in the background?", "label": 0}, {"captions": ["a person is snoring while sleeping", "vehicles pass by on a roadway"], "sample_ids": ["vJrjSeP17yE", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "pass, vehicle, roadway"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["dog barking and vehicle 
engine idling followed shortly by vehicle engine revving", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["zY3icUyMdh8", "vlS6YMeWAPo"], "start_seconds": ["20", "40"], "properties": ["dog, bark, engine", "sheep, baa, birds"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp as a bell rings", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["ziUT9IFTkjg", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], "properties": ["chirp, bell, ring", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a car accelerates and wind blows"], "sample_ids": ["wTideSjRFS0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["food, sizzle, woman", "accelerates, wind, blows"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zk-xJGQU8-4", "vYkA3cfXp5Q"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "engine, accelerate, idle"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "footage of a car driving 
down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a machine beeps continuously"], "sample_ids": ["vBHyYJ8pL0", "y682ml90jGw"], "start_seconds": ["2", "11"], "properties": ["noise, door, opening", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a beeping sound is being made "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, {"captions": ["a man speaks over a running engine and blowing wind", "pigeons vocalize and birds chirp"], "sample_ids": ["ylpYOorfH4o", "uiS58TNyUiw"], "start_seconds": ["410", "430"], "properties": ["engine, running, wind", "vocalize, bird, chirp"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "several insects fly while two men talk"], "sample_ids": ["w-4gHptFNuU", 
"s-T9OVOiMLo"], "start_seconds": ["21", "330"], "properties": ["engine revs, accelerates, bump", "several, fly, men"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a clock ticktocks briefly"], "sample_ids": ["wP8ZKrlx3oA", "u7C-AEBQM"], "start_seconds": ["40", "30"], "properties": ["rain, storm, thunder", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a ticktock of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vr8ZXjEBhMQ", "uZesmtKZGSw"], "start_seconds": ["150", "250"], "properties": ["sound, distance, engine", "men, talk, cars"], "captions_pred_video": ["is taken from a motorcycle's point of view", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", 
"a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["scraping and female speech with distant music", "birds chirp and objects are moved around"], "sample_ids": ["yHeVV-xeOxQ", "yPUYU6t3rwo"], "start_seconds": ["130", "370"], "properties": ["female, speech, music", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a girl milking a goat's udder", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yFB25fqfU8I", "sSMl2vc3ek"], "start_seconds": ["300", "20"], "properties": ["wave, crash, shoreline", "loud, multiple, distance"], "captions_pred_video": ["footage of a person surfing in the ocean", null], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "waves crash against a shoreline and people speak"], "sample_ids": ["vJvryTwuAV8", "yFB25fqfU8I"], "start_seconds": ["16", "300"], "properties": ["audience, cheer, man", "wave, crash, shoreline"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a door slams shut roughly", "multiple ducks quack continuously"], "sample_ids": 
["zkKdxzNC97Y", "wfHeoPDLMaM"], "start_seconds": ["27", "30"], "properties": ["a door, slams, shut", "multiple, quack, continuously"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire"], "captions_pred_audio": ["a door is opened and closed", "ducks are quacking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaking with light rustling", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zOZleIRqZm4", "wqZ135Ssz0"], "start_seconds": ["80", "60"], "properties": ["light, rustling, man", "two men, woman, birds"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["water flows followed by women screaming", "a man speaks then blows a vehicle horn as wind blows"], "sample_ids": ["w5W5Kqtc8E", "zALy31PjDl0"], "start_seconds": ["100", "21"], "properties": ["water, flow, women", "a man, a 
vehicle, a horn"], "captions_pred_video": [null, "a motorcycle is parked on the side of a brick walkway"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and a car horn is honking"], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["multiple ducks quack continuously", "some men converse over an engine running"], "sample_ids": ["wfHeoPDLMaM", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["multiple, quack, continuously", "men, converse, engine"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a train horn sounds as it passes by", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["ukg5L09Wpvo", "xfaoyyzw2WU"], "start_seconds": ["150", "180"], "properties": ["sound, train, horn", "loud, jet engine, roar"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a 
train blows its whistle and blows its horn ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vBHyYJ8pL0", "vb1fPSDI4c"], "start_seconds": ["2", "30"], "properties": ["noise, door, opening", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["several ducks are quacking and squawking", "some men converse over an engine running"], "sample_ids": ["wfHeoPDLMaM", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["quacking, squawking, ducks", "men, converse, engine"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a group of people", "label": 1}, {"captions": ["gunshots ring out, a man 
yells, and more shots follow", "water pouring and bubbling"], "sample_ids": ["vKrYfzleLB8", "uyRfq-jKPpo"], "start_seconds": ["110", "50"], "properties": ["a, ring, gunshots", "water, bubbles, pouring"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks while water drains", "an airplane engine runs"], "sample_ids": ["vSeGhaZt-aI", "yVPZ2MNWpms"], "start_seconds": ["50", "0"], "properties": ["water, drain, man", "engine, airplane, runs"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays"], "sample_ids": ["viuTg1M-dqg", "sU53zg9Jp7s"], "start_seconds": ["30", "380"], "properties": 
["two men, speak, follow", "a bird chirps, a door bell ringing, a woman gasps"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "a cartoon girl is standing in front of a blue couch"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "birds chirp and a doorbell rings with breathing and music in the background "], "question": "which entity has a doorbell ringing?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["su6FAOcOA8c", "uZesmtKZGSw"], "start_seconds": ["4", "250"], "properties": ["engine, run, woman", "men, talk, cars"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has a woman making an announcement?", "label": 0}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a person uses a saw to cut some wood"], "sample_ids": ["ziUT9IFTkjg", "sHbXC6na9hg"], "start_seconds": ["10", "0"], "properties": ["background, birds, rustling", "a person, saw, wood"], "captions_pred_video": [null, "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["birds 
are chirping and a chime is ringing ", "an engine is idling and vibrating"], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a car speeding up in the distance"], "sample_ids": ["tOSWIURC-4", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["engine, work, nearby", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["birds chirp as a bell rings", "some tunes played by whistling"], "sample_ids": ["ziUT9IFTkjg", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["chirp, bell, ring", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vBHyYJ8pL0", "zl9Dqx-j7q4"], "start_seconds": ["2", "6"], "properties": ["noise, door, opening", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a child speaks in closed space"], "sample_ids": ["sK4u5T8hW78", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["a, car, pass", "child, space, speak"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a weapon fires multiple times", "water splashes as an animal walks through"], "sample_ids": ["sMC07Ucy7kg", "w1ir-sZ3Im8"], "start_seconds": ["10", "90"], "properties": ["weapon, fire, multiple", "animal, water, splashes"], "captions_pred_video": ["footage is from a car's point of view", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "water splashes and gurgles as people speak"], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a cat meows and children speak", "a power tool runs and touches a surface"], "sample_ids": ["x5cuQjOdM3E", "zfvPRf3chY"], "start_seconds": ["30", "290"], "properties": ["cat, speak, children", "power tool, run, touch"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking while a power tool is being used "], "question": "which entity is a machine", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["rqu8iB22IY", "zFjIWfSD-4"], "start_seconds": ["5", "410"], "properties": ["sound, repeats, laugh", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking while a car is driving and a ticking 
sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a car speeding up in the distance", "water flows followed by women screaming"], "sample_ids": ["u0TrcHhkPQ", "w5W5Kqtc8E"], "start_seconds": ["20", "100"], "properties": ["distance, car, speed", "water, flow, women"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is moving faster", "label": 0}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "a horn rings out as a machine runs by"], "sample_ids": ["wqZ135Ssz0", "slZLHwNbbt4"], "start_seconds": ["60", "300"], "properties": ["two men, woman, birds", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity has a horn", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a car speeding up in the distance"], "sample_ids": ["zgUgkpk78xU", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["horn, bells, ring", "distance, car, speed"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a duck quacks loudly and continuously"], "sample_ids": 
["u7C-AEBQM", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["ticks, rhythmic, quiet", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a ticktock of a clock", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["people speak as gunfire rings out", "a dog barks and whimpers"], "sample_ids": ["wqTCwqVRDlk", "sShpyu2l4YQ"], "start_seconds": ["80", "0"], "properties": ["gunfire, ring, speak", "barks, whimpers, dog"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "the puppies are playing with a toy"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a dog is barking and growling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["ylpYOorfH4o", "zl9Dqx-j7q4"], "start_seconds": ["410", "6"], "properties": ["motor, run, steady", "engine, laugh, loud"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a bell rings multiple 
times before a siren sounds in the distance", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["ul60S8TXDA8", "y2bVZ7rz-5M"], "start_seconds": ["60", "280"], "properties": ["sound, distance, bell", "motor noise, horn, siren"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking and a siren wailing?", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tK4VlLsNxak", "sSMl2vc3ek"], "start_seconds": ["120", "20"], "properties": ["a, dial, telephone", "loud, multiple, distance"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a man speaks followed by another man speaking outside"], "sample_ids": ["uEU-Hg5MTN8", "viuTg1M-dqg"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "two men, speak, follow"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["ylpYOorfH4o", "tdWhHV3X25Q"], "start_seconds": ["410", 
"60"], "properties": ["engine, run, loud", "applause, audience, yells"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "small dogs yip and bark sharply"], "sample_ids": ["vJ7JPEFhyLA", "v-wcQf4BDY0"], "start_seconds": ["16", "120"], "properties": ["three men, wind, flow", "bark, yip, sharply"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tQWGZLItBXk", "zl9Dqx-j7q4"], "start_seconds": ["170", "6"], "properties": ["voice, music, whoosh", "engine, laugh, loud"], "captions_pred_video": ["worms revolution screenshots", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a child speaks music 
plays video game sounds sound effects and sound effects play ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "three men talk while wind blows and some liquid flows"], "sample_ids": ["siJFXfGWgDk", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["a, bird, vehicle", "three men, wind, flow"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zj2R0XoFr5k", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["airplane, boy, fly", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a telephone rings followed by a woman talking"], "sample_ids": ["vuUVPzd2FXw", "tGcFnX0GHI"], "start_seconds": ["160", "0"], "properties": ["a, steam, release", "ring, talk, woman"], "captions_pred_video": ["of the person cooking on the grill with a spatula", null], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 
1}, {"captions": ["a man speaks as a machine runs", "a man speaks while a rooster crows and other people speak in the background"], "sample_ids": ["vD6lYD1l0BY", "wz7N8YRy74I"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "rooster, crow, background, people"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a clock ticktocks"], "sample_ids": ["tK4VlLsNxak", "v-g-j2uTByM"], "start_seconds": ["120", "30"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "ticktocks, clock, ticktocks"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "wind blows as people chatter quietly"], "sample_ids": ["y2ZBGpgbhHM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["animal, growl, bird", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["children speak as a female ask them questions", "a car accelerates and wind blows"], "sample_ids": ["wEBlkGWVWwE", "u0TrcHhkPQ"], "start_seconds": ["260", "20"], "properties": ["female, speak, questions", "accelerates, wind, blows"], "captions_pred_video": ["shows a person 
writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a race car accelerates and revs its engine "], "question": "which is not a person", "label": 0}, {"captions": ["electronic beeps occur in a short series", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["y682ml90jGw", "y2bVZ7rz-5M"], "start_seconds": ["11", "280"], "properties": ["beeps, series, electronic", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a beeping sound is being made ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["w2JXXIAdUdg", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["snoring, distance, person", "People, motor, brakes"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person speaking indiscriminately in the distance?", "label": 0}, {"captions": ["a woman speaks with water running", "an airplane engine spools and people speak"], "sample_ids": ["wTideSjRFS0", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["water, running, woman", "airplane, engine, spool"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a jet engine is running and people are talking"], "question": "which entity is a video of 
a woman speaking with water running?", "label": 0}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["vlS6YMeWAPo", "y2bVZ7rz-5M"], "start_seconds": ["40", "280"], "properties": ["sheep, baa, birds", "motor noise, horn, siren"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a goat bleats and birds chirp", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a motorcycle engine works nearby"], "sample_ids": ["xSKJGCItUWE", "tOSWIURC-4"], "start_seconds": ["10", "0"], "properties": ["engine, work, child", "engine, work, nearby"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a lawn mower is running "], "question": "which engine works nearby", "label": 1}, {"captions": ["a man speaks as a car is passing by", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sK4u5T8hW78", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "multiple, people, yell"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a crowd of people are 
talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a male speaks and another male speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["viuTg1M-dqg", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["two males, speaking, male", "music, gunfire, explosion"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a car speeding up in the distance"], "sample_ids": ["xKB8O8LTs6s", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["music, gunshots, explosion", "distance, car, speed"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "dishes cling together then a man begins to speak"], "sample_ids": ["xzKKf9bKNUo", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["background, noise, snoring", "cling, speak, dishes"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person snoring loudly", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a stream of water runs briefly", "water flows as men speak and yell"], "sample_ids": ["x-PeY8Yb8M4", "vJ7JPEFhyLA"], 
"start_seconds": ["300", "16"], "properties": ["stream, water, run", "water, flow, men"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows water flowing?", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "pigeons vocalize and birds chirp"], "sample_ids": ["sTpirNYo8vQ", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["a, tone, fast", "vocalize, bird, chirp"], "captions_pred_video": ["of a man taking a selfie on a bus", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["scraping and female speech with distant music", "a man speaks uses a drill"], "sample_ids": ["yHeVV-xeOxQ", "x5eIC7S0fbg"], "start_seconds": ["130", "60"], "properties": ["female, speech, music", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["of a girl milking a goat's udder", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["wind blows strongly", "wind blowing followed by a zoom"], "sample_ids": ["w8uLijTqtlU", "vr8ZXjEBhMQ"], "start_seconds": ["70", "150"], "properties": ["wind, blows, strongly", "wind, blow, zoom"], "captions_pred_video": ["footage is blurry and shaky", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["the wind is blowing strongly", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, 
{"captions": ["wind blows as people chatter quietly", "a toilet flushes and a female speaks"], "sample_ids": ["xBxDz0CFVn0", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["wind, chatter, people", "female, flushes, toilet"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a toilet flushes and a man speaks"], "question": "which entity is more silent", "label": 0}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a siren comes to life as a horn blares"], "sample_ids": ["slZLHwNbbt4", "u--KhUW8l1Y"], "start_seconds": ["300", "0"], "properties": ["clap, distance, horn", "horn, siren, life"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a firefighter spraying water from a fire hydrant at night"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a fire truck siren blares and a horn blows "], "question": "which entity is a siren", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a vehicle engine revs and tires squeal"], "sample_ids": ["yJ0TePmaOo", "yDoT73BWsdA"], "start_seconds": ["390", "10"], "properties": ["two hard objects, man, speak", "engine revs, tires squeal, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a man speaks as a car is passing by"], "sample_ids": ["yYJksgsxx5U", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["audio, woman, silverware", "a, car, pass"], "captions_pred_video": ["of a woman slicing an orange 
on a cutting board", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a man speaks as a car is passing by"], "sample_ids": ["tQWGZLItBXk", "sK4u5T8hW78"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "a, car, pass"], "captions_pred_video": ["worms revolution screenshots", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a train horn blows as it passes by"], "sample_ids": ["su6FAOcOA8c", "zVacuqSb4LI"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "horn, blows, train"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "by mike amstong 
a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is moving", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zj2R0XoFr5k", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["airplane, boy, fly", "engine, laugh, loud"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks uses a drill", "paper folding and crinkling"], "sample_ids": ["x5eIC7S0fbg", "zPpG3RD8lSs"], "start_seconds": ["60", "20"], "properties": ["A man is speaking, uses a drill, and is a tool", "paper, fold, crinkle"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out 
of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and using a power tool ", "the wind blows and a mouse clicks "], "question": "which is a tool", "label": 0}, {"captions": ["loud intermittent buzzing with intermittent laughter", "water splashes as an animal walks through"], "sample_ids": ["sLUnaPT5gM8", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["loud, laughter, intermittent", "animal, water, splashes"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a small engine idles continuously", "an engine idles consistently before sputtering some"], "sample_ids": ["y5WII6cTH7k", "rwTERCUno"], "start_seconds": ["40", "90"], "properties": ["engine, idle, continuously", "engine, idle, sputter"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", null], "captions_pred_audio": ["an engine is knocking and vibrating ", "an engine is idling and vibrating"], "question": "which engine idles consistently before sputtering some", 
"label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a clock ticktocks"], "sample_ids": ["y8WEcpOlT3I", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["wind, speak, buffeting", "ticktocks, clock, ticktocks"], "captions_pred_video": ["on how to use a sewing machine youtube", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a heavy rain falls endlessly", "people applaud and hoot and chat quietly"], "sample_ids": ["wP8ZKrlx3oA", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["heavy, rain, fall", "people, applaud, hoot"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["a train horn blows as it passes by", "paper is crumpling consistently"], "sample_ids": ["zVacuqSb4LI", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["horn, blows, train", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a 
video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "water splashes and a door squeaks"], "sample_ids": ["y8dSeubCNI", "sdXV-ylviw"], "start_seconds": ["4", "190"], "properties": ["men, women, car", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine revving and people talking in the background", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vddP56-ogds", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["water, splash, person, laugh", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a car accelerates and wind blows"], "sample_ids": ["sofxkNWaP0s", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["wind, engine, louder", "accelerates, wind, blows"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", null], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a small engine spits as it runs", "loud 
intermittent buzzing with intermittent laughter"], "sample_ids": ["sZvwOuuPGP0", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["spits, engine, runs", "loud, laughter, intermittent"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a medium engine is running ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a person", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a duck quacks loudly and continuously"], "sample_ids": ["ylpYOorfH4o", "vh30P49Po6s"], "start_seconds": ["410", "30"], "properties": ["motor, run, steady", "loud, continuous, quacks"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a door slams shut roughly"], "sample_ids": ["vbr9mHKc8WM", "zkKdxzNC97Y"], "start_seconds": ["40", "27"], "properties": ["noise, loudness, engine", "a door, slams, shut"], "captions_pred_video": [null, "footage of the door opening and closing in 
slow motion"], "captions_pred_audio": ["an engine is idling", "a door is opened and closed"], "question": "which entity is louder", "label": 1}, {"captions": ["people speak and tapping occurs", "plastic is tapped on while someone speaks"], "sample_ids": ["tFCUUGdREgA", "wvKpEYswXO0"], "start_seconds": ["70", "150"], "properties": ["people, tap, speak", "plastic, tap, speak"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "water is sprayed across a hard surface"], "sample_ids": ["uWAAAL4CIoc", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["a woman, chirps, animal", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "wind blowing followed by a zoom"], "sample_ids": ["xfaoyyzw2WU", "vr8ZXjEBhMQ"], "start_seconds": ["180", "150"], "properties": ["loud, jet engine, roar", "wind, blow, zoom"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "wind blows and a chainsaw cuts through wood "], "question": "which is not loud", "label": 1}, {"captions": ["a man woman speak while crickets sing", "pigeons vocalize and birds chirp"], "sample_ids": ["zTLVJCo4WEE", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["a, crickets, sing", "vocalize, bird, 
chirp"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of the pigeon in the cage"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "metal clacking as food and oil sizzles followed by a woman talking"], "sample_ids": ["wAAkbZToh8", "vW4x7S1VfQc"], "start_seconds": ["0", "150"], "properties": ["burp, laugh, speak", "clacking, oil, woman"], "captions_pred_video": [null, "footage of a person cooking fish in a frying pan on a stove top"], "captions_pred_audio": ["a man burps and a woman speaks", "food sizzles in a frying pan"], "question": "which entity is a person", "label": 0}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yks4cLgIDMc", "zl9Dqx-j7q4"], "start_seconds": ["170", "6"], "properties": ["background, speaking, child", "engine, laugh, loud"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and a child is crying", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "wind blows as people chatter quietly"], "sample_ids": ["yNtRmrn0io8", "xBxDz0CFVn0"], "start_seconds": ["210", "30"], "properties": ["storm, distance, strike", "wind, chatter, people"], "captions_pred_video": ["footage of a house in the middle of the night", "footage is blurry and out of focus"], "captions_pred_audio": ["rain falls and thunder roars", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water drains", "an infant crying frantically"], "sample_ids": ["sfAvvZwdLCY", "zwOBqeFTgiU"], 
"start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "cry, infant, frantically"], "captions_pred_video": ["footage of the toilet in the bathroom", "of the baby crying in the car seat"], "captions_pred_audio": ["a toilet is flushed", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tMbMDvT50j8", "tDVADusiIoc"], "start_seconds": ["12", "60"], "properties": ["a, cry, woman", "water, radio, man"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["multiple birds vocalize and wind blows", "a woman speaks happily and an animal chirps"], "sample_ids": ["uoGVs9yUqY4", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["multiple, vocalize, wind", "a woman, chirps, animal"], "captions_pred_video": ["for how to make a wooden shed door youtube", null], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xZepNM9qcRA", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["background, motor, run", "engine, revs, vehicle"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a race car accelerates and revs its 
engine "], "question": "which entity has a vehicle passing by?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "water splashes as an animal walks through"], "sample_ids": ["uzQnlJXBbOM", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["ringing, beep, stop", "animal, water, splashes"], "captions_pred_video": ["footage of a person using a cell phone on a table", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a telephone rings and a man speaks", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "vehicles pass by on a roadway"], "sample_ids": ["y2bVZ7rz-5M", "tgbONvsP47Y"], "start_seconds": ["280", "0"], "properties": ["engine, horn, siren", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "some tunes played by whistling"], "sample_ids": ["su6FAOcOA8c", "u6BnG6YZqJ4"], "start_seconds": ["4", "0"], "properties": ["engine, run, woman", "tune, play, whistling"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uJV8NDaHqqk", "su6FAOcOA8c"], "start_seconds": 
["100", "4"], "properties": ["loud, fly, chirp", "engine, idle, woman"], "captions_pred_video": ["a bee hive in a wooden box", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking and a subway train is moving "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a man speaks over intermittent keyboard taps"], "sample_ids": ["t69a8aRKhmc", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["a, b, c", "audio, man, keyboard"], "captions_pred_video": ["footage is blurry and out of focus", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a man speaks followed by another man speaking outside"], "sample_ids": ["u21-Z5gJCB8", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "two men, speak, follow"], 
"captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking with another voice speaking in the background?", "label": 0}, {"captions": ["a male speaks over some small clicks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uXxVebHsGZ8", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["male, clicks, speak", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a man speaks as a car is passing by"], "sample_ids": ["zY3icUyMdh8", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "a, car, pass"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more passive", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["y8dSeubCNI", "vlS6YMeWAPo"], "start_seconds": ["4", "40"], "properties": ["men, women, car", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["an engine revving and people talking in the background", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["yks4cLgIDMc", "xjvTpk2Zpr8"], "start_seconds": ["170", "70"], "properties": ["background, speaking, child", "wind, blows, vehicle"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a child is crying", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xjhAnI2q6hM", "vfYTJq7nU"], "start_seconds": ["6", "130"], "properties": ["engine revs, vehicle, people", "rustling, ducks, quack"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a duck quacks and a woman speaks"], 
"question": "which entity is about animals?", "label": 1}, {"captions": ["a church bell rings several times", "someone is typing on a computer keyboard"], "sample_ids": ["sUVVjE3Ucp8", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["ring, bell, several", "keyboard, type, computer"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "how to make money on youtube in spanish"], "captions_pred_audio": ["a church bell is ringing ", "a person is typing on a keyboard"], "question": "which is not a type of computer", "label": 0}, {"captions": ["birds chirp quietly and an adult man speaks", "an engine runs loudly"], "sample_ids": ["zuua6-5goWw", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["birds, chirp, quiet, man, speaks", "loud, engine, run"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage is blurry because it's raining outside"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a lawn mower is running and men are speaking "], "question": "which is quieter", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "paper is crumpling consistently"], "sample_ids": ["uzQnlJXBbOM", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["ringing, beep, stop", "paper is crumpling, paper is white, paper is 
crumpling"], "captions_pred_video": ["footage of a person using a cell phone on a table", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a telephone rings and a man speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tw76HGONaKg", "xfaoyyzw2WU"], "start_seconds": ["570", "180"], "properties": ["A, game, keyboard", "loud, jet engine, roar"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "several insects fly while two men talk"], "sample_ids": ["vdoxuJn9lTc", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["burp, loud, girl", "several, fly, men"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a man climbing up 
a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a girl speaking?", "label": 0}, {"captions": ["a vehicle engine revs as the vehicle passes", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yDoT73BWsdA", "wqZ135Ssz0"], "start_seconds": ["10", "60"], "properties": ["engine, revs, vehicle", "two men, woman, birds"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a child speaks in closed space"], "sample_ids": ["xfudFO976zE", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["animal, bleats, cry", "child, space, speak"], "captions_pred_video": ["footage is blurry and shaky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["paper is crumpling consistently", "a car accelerates and wind blows"], "sample_ids": ["v5cSxLaHADY", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "accelerates, wind, blows"], "captions_pred_video": ["footage of the person holding a pair of scissors", null], "captions_pred_audio": ["paper is crumpled and crinkled", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a man speaks as a motor runs in the 
background"], "sample_ids": ["vJ7JPEFhyLA", "xZepNM9qcRA"], "start_seconds": ["16", "30"], "properties": ["three men, wind, flow", "background, motor, run"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["continuous sneezing together with speech", "frogs croak and vocalize"], "sample_ids": ["x4dZyf9Gbj0", "yswmmRZFItk"], "start_seconds": ["130", "0"], "properties": ["continuous, sneeze, speech", "croak, vocalize, frog"], "captions_pred_video": ["footage is blurry and out of focus", "a close up of a frog in the water"], "captions_pred_audio": ["a woman sneezes and speaks", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a horn rings out as a machine runs by"], "sample_ids": ["uZesmtKZGSw", "slZLHwNbbt4"], "start_seconds": ["250", "300"], "properties": ["car, track, man", "a, horn, run"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a train is moving and blowing 
its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "someone snores nearby"], "sample_ids": ["vJ7JPEFhyLA", "spJCm8tD9Zo"], "start_seconds": ["16", "90"], "properties": ["three men, wind, flow", "someone snores, nearby, someone"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sWZzXuWYY", "vbZ-0lGPneg"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "someone whistles a tune"], "sample_ids": ["uqFtmnhuqA8", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["a, b, c", "someone, tune, whistle"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", null], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xjvTpk2Zpr8", "sSMl2vc3ek"], "start_seconds": ["70", "20"], "properties": ["wind, blows, vehicle", "loud, multiple, 
distance"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a toilet flushes and a female speaks"], "sample_ids": ["w0xsN8X18Y", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["music, surface, rain", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a toilet flushes and a man speaks"], "question": "which entity is more likely to be in a bathroom?", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xM4joTqDVp4", "tdWhHV3X25Q"], "start_seconds": ["160", "60"], "properties": ["background, chirp, birds", "applause, audience, yells"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a duck quacks continuously"], "sample_ids": ["un9VQlzgZM", "vh30P49Po6s"], "start_seconds": ["5", "30"], "properties": ["females, talk, laugh", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a male speaks and another male 
speaks", "small dogs yip and bark sharply"], "sample_ids": ["viuTg1M-dqg", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["two males, speaking, male", "bark, yip, sharply"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "birds chirp and objects are moved around"], "sample_ids": ["yYJksgsxx5U", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["audio, woman, silverware", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "insects buzz and a man speaks"], "question": "which entity is a video?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a car accelerates and wind blows"], "sample_ids": ["tEE3MpBt1sg", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["drill, something, laugh", "accelerates, wind, blows"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "people applaud and hoot and chat quietly"], "sample_ids": ["xKB8O8LTs6s", "wwyfGO2J4"], "start_seconds": ["70", "90"], "properties": ["music, gunfire, explosion", "people, applaud, hoot"], "captions_pred_video": ["in your own words a screenshot of 
the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xyL9F5VrjkE", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["engine, run, wind", "airplane, boy, fly"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man is filing a hard object", "cats meow and then a person begins to talk while the cats continue to meow"], "sample_ids": ["vveS8HT7Uog", "x5cuQjOdM3E"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "cat, talk, meow"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a cat meows and a woman speaks"], "question": "which entity is talking", "label": 1}, {"captions": ["a motorcycle engine is idling", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vZAqdHZ81yA", "zj2R0XoFr5k"], "start_seconds": ["180", "50"], "properties": ["engine, motorcycle, idling", "airplane, boy, fly"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["an engine is idling 
loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a car accelerates and wind blows"], "sample_ids": ["wqUmIEzuNz4", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["frog, bird, vocalize", "accelerates, wind, blows"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", null], "captions_pred_audio": ["a cat meows and rustles", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zTLVJCo4WEE", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "gun, shoot, water"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a consistent ticking pattern"], "sample_ids": ["vdoxuJn9lTc", "sCeWURVHfOM"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "ticking, pattern, clock"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "- a close-up view of the clock's inner workings"], "captions_pred_audio": ["a child speaks followed by a burp", "ticking of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["water flows followed by women screaming", "a vehicle engine accelerating then running on idle"], "sample_ids": ["w5W5Kqtc8E", "vYkA3cfXp5Q"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on 
a sunny day"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "an aircraft engine runs"], "sample_ids": ["vddP56-ogds", "yLCORCnd35Q"], "start_seconds": ["30", "0"], "properties": ["water, flow, laugh", "engine, aircraft, runs"], "captions_pred_video": [null, "a lufthansa airbus a380 landing at london's heathrow airport"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a train is moving and its wheels are squealing "], "question": "which entity is a moving object", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sxYkFKFIZD0", "zl9Dqx-j7q4"], "start_seconds": ["20", "6"], "properties": ["screech, man, door", "engine, laugh, loud"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a machine beeps continuously"], "sample_ids": ["zofjfKhqLk8", "y682ml90jGw"], "start_seconds": ["10", "11"], "properties": ["background, metal, clank", "beeps, machine, continuously"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": 
["a large engine is running and a bell is ringing", "a beeping sound is being made "], "question": "which machine beeps continuously", "label": 1}, {"captions": ["an engine starts and increases in power", "an airplane engine runs"], "sample_ids": ["zjTG0gaGCUI", "yVPZ2MNWpms"], "start_seconds": ["80", "0"], "properties": ["power, increase, engine", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "small dogs yip and bark sharply"], "sample_ids": ["vbZ-0lGPneg", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["a woman, a television program, a bird", "bark, yip, sharply"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "paper is crumpling consistently"], "sample_ids": ["slZLHwNbbt4", "v5cSxLaHADY"], "start_seconds": ["300", "0"], "properties": ["train, horn, sound", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a door opens and birds chirp", "a woman speaks as she rubs two objects together"], "sample_ids": ["yeFvk9x0wWI", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": 
["door, open, birds", "two objects, woman, speak"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["frogs croak and vocalize", "frogs croak and vocalize"], "sample_ids": ["yswmmRZFItk", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["croak, vocalize, frog", "croak, vocalize, frog"], "captions_pred_video": ["a close up of a frog in the water", "a close up of a frog in the water"], "captions_pred_audio": ["a frog is croaking", "a frog is croaking"], "question": "which frog is croaking", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a female speaks softly as paper crinkles"], "sample_ids": ["w0xsN8X18Y", "xvDdE3zNf8Y"], "start_seconds": ["30", "120"], "properties": ["music, surface, rain", "a, female, speaks"], "captions_pred_video": [null, "of a woman in a white shirt and glasses 
holding a purple tie"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a woman speaks and crumples paper"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["vBslzh7saPw", "t25U-v4k4ts"], "start_seconds": ["90", "40"], "properties": ["engine, roar, louder", "a, chirps, bird"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and bees are buzzing"], "question": "which entity is quieter", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["yYEVLuqEytU", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["grunt, slurp, background", "engine revs, vehicle, people"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a car speeding up in the distance"], "sample_ids": ["s4Uz1Ffgo04", "u0TrcHhkPQ"], "start_seconds": ["100", "20"], "properties": ["roars, background, people speaking", "distance, car, speed"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a race car accelerates and revs its engine "], "question": "which car is speeding up in the distance", "label": 1}, {"captions": ["an animal bleats and cries out 
and metal bangs", "paper is crumpling consistently"], "sample_ids": ["xfudFO976zE", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["animal, bleats, cry", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and shaky", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a woman and man speak while food is frying", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zk-xJGQU8-4", "uZesmtKZGSw"], "start_seconds": ["130", "250"], "properties": ["food, man, woman", "men, talk, cars"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a man and woman speaking?", "label": 0}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "winds blows roughly as a vehicle races past"], "sample_ids": ["w9lpbUn0hPc", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["male, wind, rustling", "wind, blows, vehicle"], "captions_pred_video": ["footage of a man in a black shirt 
standing in front of a white truck in a parking lot", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["x9JovgqUcs", "uYT5gxnyMWM"], "start_seconds": ["500", "50"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and typing on a computer keyboard?", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["y8WEcpOlT3I", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["wind, speak, buffeting", "People, motor, brakes"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a car?", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a horn rings out as a machine runs by"], "sample_ids": ["sOa7g-44Dag", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["background, man, spray", "a, horn, run"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and rubbing his hands together 
", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is about a machine running by?", "label": 1}, {"captions": ["someone snores nearby", "water splashes and a motorboat passes as people yell"], "sample_ids": ["spJCm8tD9Zo", "w5W5Kqtc8E"], "start_seconds": ["90", "100"], "properties": ["someone snores, nearby, someone", "water, splashes, motorboat"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more active", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a woman speaks in a fast tone with a male"], "sample_ids": ["uYT5gxnyMWM", "sTpirNYo8vQ"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "a, tone, fast"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while a car is revving and accelerating "], "question": "which entity is more calm", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["v7jJS8aAyA", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["wind, blows, loudly", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["goats bleat and metal clings", "an infant crying frantically"], "sample_ids": ["tH17JPjDPnc", "zwOBqeFTgiU"], "start_seconds": ["260", "30"], "properties": ["bleat, metal, clings", "cry, infant, frantically"], "captions_pred_video": ["feed of the goats eating 
hay in the barn", "of the baby crying in the car seat"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a person is snoring while sleeping", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vJrjSeP17yE", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["a person is sleeping, snoring, person", "rustling, ducks, quack"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a duck quacks and a woman speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["a horse runs while two women talk", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sdvI1mHAsc", "su6FAOcOA8c"], "start_seconds": ["20", "4"], "properties": ["two women, horse, run", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a woman and man are speaking"], "sample_ids": ["tw76HGONaKg", "vbpKkWvfOu4"], "start_seconds": ["570", "560"], "properties": ["A, game, keyboard", "two people, speaking, woman, man"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda 
ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a woman is speaking and a man is speaking"], "question": "which entity shows two people speaking", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "continuous snoring"], "sample_ids": ["sapQIQUhFc", "sLkeqCDJIyw"], "start_seconds": ["280", "120"], "properties": ["liquid, flow, distance", "loud, snoring, noise"], "captions_pred_video": [null, ", what is the man doing on the couch? 
sleeping"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["electronic beeps occur in a short series", "some tunes played by whistling"], "sample_ids": ["y682ml90jGw", "u6BnG6YZqJ4"], "start_seconds": ["11", "0"], "properties": ["beeps, series, electronic", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a beeping sound is being made ", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["children speak and play together", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["yVVP8XvWJTo", "tdWhHV3X25Q"], "start_seconds": ["260", "60"], "properties": ["children, speak, play", "applause, audience, yells"], "captions_pred_video": ["footage of a playground at a school or daycare center", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["s4Uz1Ffgo04", "tDlysoZiA1I"], "start_seconds": ["100", "0"], "properties": ["roars, background, people speaking", "animal, grunts, chirps"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "birds are chirping and a rooster is crowing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a helicopter engine runs", "a vehicle engine runs while a siren and horn 
sound"], "sample_ids": ["t5ZbXbniOWk", "u--KhUW8l1Y"], "start_seconds": ["30", "0"], "properties": ["engine, helicopter, run", "engine, sound, horn"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "a firefighter spraying water from a fire hydrant at night"], "captions_pred_audio": ["a helicopter is flying overhead ", "a fire truck siren blares and a horn blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a man talks while metallic objects are rapped and steam is released"], "sample_ids": ["un9VQlzgZM", "vuUVPzd2FXw"], "start_seconds": ["5", "160"], "properties": ["females, talk, laugh", "a, steam, release"], "captions_pred_video": [null, "of the person cooking on the grill with a spatula"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking and dishes are clanging"], "question": "which entity is a man?", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vKrYfzleLB8", "vlS6YMeWAPo"], "start_seconds": ["110", "40"], "properties": ["a, ring, gunshots", "sheep, baa, birds"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wqZ135Ssz0", "zj2R0XoFr5k"], "start_seconds": ["60", "50"], "properties": ["two men, woman, birds", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane 
flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about flying?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "wind blowing followed by a zoom"], "sample_ids": ["tQWGZLItBXk", "vr8ZXjEBhMQ"], "start_seconds": ["170", "150"], "properties": ["voice, music, whoosh", "wind, blow, zoom"], "captions_pred_video": ["worms revolution screenshots", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more like a natural phenomenon", "label": 1}, {"captions": ["paper folding and crinkling", "a man speaks while water drains"], "sample_ids": ["zPpG3RD8lSs", "vSeGhaZt-aI"], "start_seconds": ["20", "50"], "properties": ["paper, fold, crinkle", "water, drain, man"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "a man in a kitchen preparing a 
smoothie with a blender"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video", "label": 1}, {"captions": ["a goat bleats as a person speaks", "water splashes as an animal walks through"], "sample_ids": ["tPJvjq9QePY", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["bleats, person, speak", "animal, water, splashes"], "captions_pred_video": ["a dog and a sheep in a barn", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a baby cries and a man speaks", "water splashes and gurgles as people speak"], "question": "which entity is a video of an animal walking through water?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "someone is typing on a computer keyboard"], "sample_ids": ["xhmRY9yhC7c", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["a, laugh, infant", "keyboard, type, computer"], "captions_pred_video": ["of a baby crying in a baby bouncer", "how to make money on youtube in spanish"], "captions_pred_audio": ["a baby cries and a woman speaks", "a person is typing on a keyboard"], "question": "which is not a person", "label": 0}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a motorcycle engine works nearby"], "sample_ids": ["rwtmaKiCcQU", "tOSWIURC-4"], "start_seconds": ["30", "0"], "properties": ["nozzle, depressed, spray can", "engine, work, nearby"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", null], "captions_pred_audio": ["spraying and people speaking", "a lawn mower is running "], "question": "which entity is a machine", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["uZesmtKZGSw", "wSVhSdj0F0"], "start_seconds": ["250", "10"], "properties": ["men, talk, cars", "horn 
honks, keys jingle, electronic beep"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a car horn honks and keys jangle with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a car accelerates and wind blows"], "sample_ids": ["vf9xf3vMsGM", "u0TrcHhkPQ"], "start_seconds": ["540", "20"], "properties": ["A man speaks while turning a water faucet on.", "accelerates, wind, blows"], "captions_pred_video": ["of the person washing their hands under the faucet", null], "captions_pred_audio": ["a man is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks with water running", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["wTideSjRFS0", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["water, running, woman", "sound, chirp, buzz"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a bee on a purple thistle flower"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a bee buzzes and a woman speaks"], "question": "which entity has a woman speaking with water running?", "label": 0}, {"captions": ["continuous chugging with birds chirping 
in the background", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["xM4joTqDVp4", "vqZuVbG6-HI"], "start_seconds": ["160", "130"], "properties": ["background, chirp, birds", "background, male, female"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage is blurry because it's raining outside"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a lawn mower is running and men are speaking "], "question": "which entity has a male and female voice in the background?", "label": 1}, {"captions": ["a door slams shut roughly", "someone snores nearby"], "sample_ids": ["zkKdxzNC97Y", "spJCm8tD9Zo"], "start_seconds": ["27", "90"], "properties": ["a door, slams, shut", "someone snores, nearby, someone"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a door is opened and closed", "a person is snoring loudly"], "question": "which entity is more annoying", "label": 1}, {"captions": ["small dogs yip and bark sharply", "pigeons vocalize and birds chirp"], "sample_ids": ["v-wcQf4BDY0", "uiS58TNyUiw"], "start_seconds": ["120", "430"], "properties": ["bark, yip, sharply", "vocalize, bird, chirp"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "of the pigeon in the cage"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wSVhSdj0F0", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["horn honks, keys jingle, slam", "a woman, laughs, animal"], "captions_pred_video": [null, "of a 
girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be in a car", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vbpKkWvfOu4", "sSMl2vc3ek"], "start_seconds": ["560", "20"], "properties": ["a, man, speaks", "loud, multiple, distance"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "birds chirp and objects are moved around"], "sample_ids": ["vK93VuO0yNc", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["male voice, bus, rumble", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks as several small engines run", "an infant crying as a woman laughs"], "sample_ids": ["u9A6VZQCZpU", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["a, man, talk", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a baby cries and a woman speaks"], 
"question": "which entity is a person", "label": 1}, {"captions": ["a door opens and closes", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vBHyYJ8pL0", "xKB8O8LTs6s"], "start_seconds": ["2", "70"], "properties": ["open, close, door", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "someone snores nearby"], "sample_ids": ["sa6TLVbooCc", "spJCm8tD9Zo"], "start_seconds": ["240", "90"], "properties": ["people, laugh, child", "someone snores, nearby, someone"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["an animal quacks rapidly", "a toilet flushes and a female speaks"], "sample_ids": ["vh30P49Po6s", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["animal, quacks, rapidly", "female, flushes, toilet"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage is blurry and out of focus"], "captions_pred_audio": ["a duck is quacking loudly", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a music is played followed by a frog croaking and then music is played again"], "sample_ids": ["viuTg1M-dqg", "voJh2gJxXhA"], "start_seconds": ["30", "50"], "properties": ["two men, speak, follow", "music, frog, croak"], "captions_pred_video": ["footage of 
water coming out of a hole in the ground", "a frog on a black background with a red diamond in the center"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "music is playing and crickets are chirping "], "question": "which entity is a frog?", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "vehicles pass by on a roadway"], "sample_ids": ["tDlysoZiA1I", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["animal, grunts, chirps", "pass, vehicle, roadway"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a car is driving on the road "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a male speaks over some small clicks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uXxVebHsGZ8", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["male, clicks, speak", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "birds chirp and objects are moved around"], "sample_ids": ["sQGXqGcwOTc", "yPUYU6t3rwo"], "start_seconds": ["3", "370"], "properties": ["cling, speak, dishes", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a door opens and closes", "a male is speaking and a duck quacks as 
others laugh"], "sample_ids": ["vBHyYJ8pL0", "tiDFTC-5vU"], "start_seconds": ["2", "30"], "properties": ["open, close, door", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xhmRY9yhC7c", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["a, laugh, infant", "music, gunfire, explosion"], "captions_pred_video": ["of a baby crying in a baby bouncer", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby cries and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["frogs croak and vocalize", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["yswmmRZFItk", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["croak, vocalize, frog", "a train, a horn, a bell"], "captions_pred_video": ["a close up of a frog in the water", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a frog is croaking", "a train blows its whistle and blows its horn "], "question": "which entity is a warning device", "label": 1}, {"captions": ["some men converse over an engine running", "water flows and trickles"], "sample_ids": ["sCiy7QS1U", "tB7hWb9gTuQ"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "water is splashing and gurgling"], "question": "which entity is 
moving", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["sG7TyPnFDR0", "sYITalLZjj4"], "start_seconds": ["180", "30"], "properties": ["beeps, machine, smoke alarm", "water, rushes, background, birds"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "two ducks are swimming in the water near each other"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "wind blows and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a propeller rotates loudly and intensely"], "sample_ids": ["sU53zg9Jp7s", "ugHJF0hfYkg"], "start_seconds": ["380", "10"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "loud, intense, propeller"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a man speaks as a car is passing by"], "sample_ids": ["yRx9txMcBl0", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "a, car, pass"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a man speaks as a vehicle engine idles", "winds blows roughly as a vehicle races past"], "sample_ids": ["shmR4OZtzqA", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["man, engine, idle", "wind, blows, vehicle"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man speaks while a motor runs", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "pigeons vocalize and birds chirp"], 
"sample_ids": ["uPDn2BFTHk", "uiS58TNyUiw"], "start_seconds": ["140", "430"], "properties": ["woman, laughs, speaks", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "pigeons vocalize and birds chirp"], "sample_ids": ["vmrxwuAMb2I", "uiS58TNyUiw"], "start_seconds": ["40", "430"], "properties": ["a dog, inhales, exhales", "vocalize, bird, chirp"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "of the pigeon in the cage"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["goats bleat and people speak", "wind blowing followed by a zoom"], "sample_ids": ["z5iUE5h0EPs", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["goats bleat, people speak, language", "wind, blow, zoom"], "captions_pred_video": ["of the goat in the barn", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a goat bleats and a man speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a frog croaks as other frogs croak in the background"], "sample_ids": ["tDlysoZiA1I", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["animal, grunt, chirp", "background, frog, croak"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a frog is croaking"], "question": "which entity is a solitary animal?", "label": 0}, {"captions": ["continuous snoring", "a person is burping while a girl speaks"], "sample_ids": ["sLkeqCDJIyw", 
"vdoxuJn9lTc"], "start_seconds": ["120", "40"], "properties": ["loud, snoring, noise", "person, burp, girl"], "captions_pred_video": [", what is the man doing on the couch? sleeping", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a person is snoring loudly", "a child speaks followed by a burp"], "question": "which noise is louder", "label": 0}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "water splashes as an animal walks through"], "sample_ids": ["xOZfdgAgJ9o", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["woman, whimpering, speaking", "animal, water, splashes"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "multiple beeps are followed by a squawk and a child speaking"], "sample_ids": ["sLUnaPT5gM8", "w34HjHr6gAY"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "beeps, squawk, child speaking"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a beep sounds followed by a child 
speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["ugHJF0hfYkg", "sjlVMgdGSK0"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", "accelerates, vehicle, race car"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car accelerates and revs its engine "], "question": "which is a vehicle", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vKrYfzleLB8", "uYT5gxnyMWM"], "start_seconds": ["110", "50"], "properties": ["a, ring, gunshots", "female, spraying, scream"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person spraying and screaming?", "label": 1}, {"captions": ["a small engine idles continuously", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y5WII6cTH7k", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["engine, idle, continuously", "loud, multiple, distance"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", null], "captions_pred_audio": ["an engine is knocking and vibrating ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["goats bleat and people speak", "winds blows roughly as a vehicle races past"], "sample_ids": ["z5iUE5h0EPs", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["goats bleat, people 
speak, language", "wind, blows, vehicle"], "captions_pred_video": ["of the goat in the barn", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a goat bleats and a man speaks", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a man speaks, another man speaks, and a small bell dings"], "sample_ids": ["wEBlkGWVWwE", "t69a8aRKhmc"], "start_seconds": ["260", "30"], "properties": ["a, babble, woman", "a, b, c"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["people converse in the distance as a clock ticks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vZAw4apG0Es", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["people, clock, converse", "engine, idle, woman"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a telephone rings followed by a woman talking"], "sample_ids": ["w8uLijTqtlU", "tGcFnX0GHI"], "start_seconds": ["70", "0"], "properties": ["wind, microphone, noise", "ring, talk, woman"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a recording", "label": 1}, {"captions": 
["a man makes an exclamation, then another man speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xO-Q2BlIIPU", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["two men, exclamation, speak", "engine, idle, woman"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["tapping occurs then a baby cries", "water is sprayed across a hard surface"], "sample_ids": ["wIJK3-5y0kA", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["a, cry, baby", "water, spray, surface"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a baby cries and a woman speaks", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yaln9y8I7ms", "w5W5Kqtc8E"], "start_seconds": ["230", "100"], "properties": ["female, flushes, toilet", "wind, blow, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a telephone rings followed by a woman talking"], "sample_ids": ["uEU-Hg5MTN8", "tGcFnX0GHI"], "start_seconds": ["27", "0"], "properties": ["a woman, laughs, animal", "ring, talk, woman"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], 
"captions_pred_audio": ["a woman is speaking and a baby is crying", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "wind blows as people chatter quietly"], "sample_ids": ["w2JXXIAdUdg", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "wind, chatter, people"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage is blurry and out of focus"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person sniffs and sneezes", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uRlbY6aoBU", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["sneezes, person, sniffs", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a stream of water runs briefly"], "sample_ids": ["wz7N8YRy74I", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["rooster, crow, background, men", "stream, water, run"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "people converse as a motor runs and air brakes hiss"], 
"sample_ids": ["s3cTDAj31g", "zFjIWfSD-4"], "start_seconds": ["80", "410"], "properties": ["man, talk, woman", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man talking followed by a woman shouting?", "label": 0}, {"captions": ["music plays followed by gunshots and then an explosion", "water flows and trickles"], "sample_ids": ["xKB8O8LTs6s", "tB7hWb9gTuQ"], "start_seconds": ["70", "30"], "properties": ["music, gunshots, explosion", "water, flow, trickle"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "water is splashing and gurgling"], "question": "which entity is more calm", "label": 1}, {"captions": ["a man talks as several small engines run", "dishes cling together then a man begins to speak"], "sample_ids": ["u9A6VZQCZpU", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["a, man, talk", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["continuous sneezing together with speech", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["x4dZyf9Gbj0", "tdWhHV3X25Q"], "start_seconds": ["130", "60"], "properties": ["continuous, sneeze, speech", "applause, audience, yells"], "captions_pred_video": ["footage is blurry and out of focus", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman sneezes and 
speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a baby coos and fidgets as a lady speaks and laughs"], "sample_ids": ["wudZTNBtVqc", "uPDn2BFTHk"], "start_seconds": ["60", "140"], "properties": ["accelerates, engine, wind", "lady, laugh, baby"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a baby laughs and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a infant makes noise and is excited"], "sample_ids": ["shmR4OZtzqA", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "noise, excited, infant"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man speaks while a motor runs", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["an adult man speaks over glass clinking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["u6jIvCtKarQ", "tdWhHV3X25Q"], "start_seconds": ["70", "60"], "properties": ["a, man, 
speaks", "applause, audience, yells"], "captions_pred_video": ["footage of a person using a blender on a stove top", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["sa6TLVbooCc", "xjvTpk2Zpr8"], "start_seconds": ["240", "70"], "properties": ["people, laugh, child", "wind, blows, vehicle"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a duck quacks continuously"], "sample_ids": ["uWPRNLnpy7Y", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "quacks, continuously, duck"], "captions_pred_video": ["is taken from a car driving down the street", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "a propeller rotates loudly and intensely"], "sample_ids": ["wAAkbZToh8", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["burp, laugh, speak", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man burps and a woman speaks", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a woman speaks and 
is crumpling paper", "an airplane engine runs"], "sample_ids": ["xvDdE3zNf8Y", "yVPZ2MNWpms"], "start_seconds": ["120", "0"], "properties": ["A, crumple, paper", "engine, airplane, runs"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman speaks and crumples paper", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a man speaks as a car is passing by"], "sample_ids": ["ziUT9IFTkjg", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["background, birds, rustling", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "a car accelerates and wind blows"], "sample_ids": ["uC9dtII1KDI", "u0TrcHhkPQ"], "start_seconds": ["150", "20"], "properties": ["wind, gusts, distance", "accelerates, wind, blows"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", null], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is about a car?", "label": 1}, {"captions": ["a frog vocalizes while 
birds chirp", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vMf1dLD6Sng", "zl9Dqx-j7q4"], "start_seconds": ["6", "6"], "properties": ["frog, bird, vocalize", "engine, laugh, loud"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a frog croaks loudly", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "birds chirp and objects are moved around"], "sample_ids": ["yeFvk9x0wWI", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["clack, bird, chirp", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a duck quacks loudly and continuously"], "sample_ids": ["w2M4i1mklOA", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "loud, continuous, quacks"], "captions_pred_video": ["footage of an antique clock", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["vzxHnu-SFEw", "uqFtmnhuqA8"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "a, b, c"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "a man speaks over intermittent keyboard taps"], "sample_ids": ["yYJksgsxx5U", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["audio, clicks, scraping", "audio, man, keyboard"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda 
ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a machine beeps continuously"], "sample_ids": ["sWZzXuWYY", "y682ml90jGw"], "start_seconds": ["420", "11"], "properties": ["male, clanks, thumps", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a beeping sound is being made "], "question": "which machine beeps continuously", "label": 1}, {"captions": ["an airplane engine runs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yVPZ2MNWpms", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["engine, airplane, runs", "a woman, something, fried"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car is driving by on the road ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a man speaks as a motor runs in the background"], "sample_ids": ["wTjoRj1se3U", "xZepNM9qcRA"], "start_seconds": ["390", "30"], "properties": ["airplane, engine, spool", "background, motor, run"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man speaks while a motorcycle revs and accelerates "], "question": 
"which entity has a motor running in the background?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["w5W5Kqtc8E", "ukg5L09Wpvo"], "start_seconds": ["100", "150"], "properties": ["water, splashes, motorboat", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ugHJF0hfYkg", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["engine, idle, continuously", "female, spraying, scream"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wvKpEYswXO0", "sSMl2vc3ek"], "start_seconds": ["150", "20"], "properties": ["plastic, tap, speak", "loud, multiple, distance"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["s4tUs779vBA", "y2bVZ7rz-5M"], "start_seconds": ["160", "280"], "properties": 
["a, sound, stop", "motor noise, horn, siren"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["a vehicle engine runs as a siren and horn sound", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["u--KhUW8l1Y", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["sound, vehicle, horn", "clickety-clack, train, whistle"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["siJFXfGWgDk", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["man, woman, vehicle", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "an infant crying as a woman laughs"], "sample_ids": ["vXlk0lIQBFo", "xhmRY9yhC7c"], "start_seconds": ["470", "20"], "properties": ["wind, speak, vocalize", "a, laugh, infant"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "of a baby crying 
in a baby bouncer"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a child speaks in closed space", "an engine runs loudly"], "sample_ids": ["yW6FWLSLkx4", "vqZuVbG6-HI"], "start_seconds": ["40", "130"], "properties": ["child, space, speak", "loud, engine, run"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["dogs barking and whimpering", "people speak in a closed space"], "sample_ids": ["tIY7qOV3rEM", "sTpirNYo8vQ"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "people, space, speak"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking while a car is revving and accelerating "], "question": "which entity is more passive", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "paper folding and crinkling"], "sample_ids": ["s4Uz1Ffgo04", "zPpG3RD8lSs"], "start_seconds": ["100", "20"], "properties": ["water, rushes, motorcycle", "paper, fold, crinkle"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "the wind blows and a mouse clicks "], "question": "which entity is stationary", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "dog barking and vehicle engine idling followed shortly by vehicle engine revving"], "sample_ids": ["vBHyYJ8pL0", "zY3icUyMdh8"], "start_seconds": ["2", "20"], "properties": ["noise, door, opening", "dog, bark, engine"], "captions_pred_video": [null, "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a car is driving and dogs are barking and squealing "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, {"captions": ["the revving of an engine throttle followed by a man speaking", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tezvROoo4bs", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["audio, throttle, speaking", "two men, woman, birds"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", null], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["an insect buzzes around continuously", "a woman talks while something is fried and 
objects are tapped"], "sample_ids": ["v25l1jef3JY", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["buzzes, continuously, insect", "a woman, something, fried"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a woman speaks happily and an animal chirps"], "sample_ids": ["vimzuGQvdcU", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["a, man, yells", "a woman, chirps, animal"], "captions_pred_video": ["a group of people are rafting down a river", null], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vJvryTwuAV8", "uYT5gxnyMWM"], "start_seconds": ["16", "50"], "properties": ["audience, cheer, man", "a, scream, girl"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a consistent ticking pattern", "water pouring and bubbling"], "sample_ids": ["sCeWURVHfOM", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["ticking, pattern, clock", "water, bubbles, pouring"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "on youtube how to do an afro puff hairstyle youtube how to do an 
afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["ticking of a clock", "water is running from a faucet"], "question": "which entity is more likely to be found in a kitchen", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vqZuVbG6-HI", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["background, male, female", "animal, grunts, snorts"], "captions_pred_video": ["footage is blurry because it's raining outside", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal grunts and snorts?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "an airplane engine runs"], "sample_ids": ["wz7N8YRy74I", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["rooster, crow, background, people", "engine, airplane, runs"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing 
", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "wind blowing followed by a zoom"], "sample_ids": ["vzceMbklWc", "vr8ZXjEBhMQ"], "start_seconds": ["180", "150"], "properties": ["water, faucet, sink", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["water is running and a man is speaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "an infant crying as a woman laughs"], "sample_ids": ["s4Uz1Ffgo04", "xhmRY9yhC7c"], "start_seconds": ["100", "20"], "properties": ["water, rushes, vehicle", "a, laugh, infant"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a beep occurs briefly", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xtWeJ56-U-g", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["beep, occur, briefly", "three men, wind, flow"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["an airplane 
engine runs", "people applaud and hoot and chat quietly"], "sample_ids": ["yVPZ2MNWpms", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["engine, airplane, runs", "people, applaud, hoot"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "people are clapping and speaking with background noise "], "question": "which entity is a performance", "label": 1}, {"captions": ["a duck quacks several times", "someone is typing on a computer keyboard"], "sample_ids": ["vh30P49Po6s", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["quacks, duck, several", "keyboard, type, computer"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "how to make money on youtube in spanish"], "captions_pred_audio": ["a duck is quacking loudly", "a person is typing on a keyboard"], "question": "which is not a type of keyboard", "label": 0}, {"captions": ["scraping and female speech with distant music", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yHeVV-xeOxQ", "wz7N8YRy74I"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "rooster, crow, background, men"], "captions_pred_video": ["of a girl milking a goat's udder", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster crow in the background?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "water splashes and a door squeaks"], "sample_ids": ["wSVhSdj0F0", "sdXV-ylviw"], "start_seconds": ["10", "190"], "properties": ["horn honks, keys jingle, slam", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks 
and keys jangle with background noise ", "a dog barks and taps with background noise "], "question": "which entity has a door?", "label": 1}, {"captions": ["an animal quacks rapidly", "a woman talks while a baby cries and a man whispers"], "sample_ids": ["vh30P49Po6s", "smDKStoHBJo"], "start_seconds": ["30", "0"], "properties": ["animal, quacks, rapidly", "a, talk, baby, cry"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man holding a crying baby in his arms"], "captions_pred_audio": ["a duck is quacking loudly", "a baby is crying and a woman is speaking"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sG7TyPnFDR0", "sSMl2vc3ek"], "start_seconds": ["180", "20"], "properties": ["beeps, machine, smoke alarm", "loud, multiple, distance"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["scraping and female speech with distant music", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yHeVV-xeOxQ", "ukg5L09Wpvo"], "start_seconds": ["130", "150"], "properties": ["female, speech, music", "clickety-clack, train, whistle"], "captions_pred_video": ["of a girl milking a goat's udder", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a stream of water flows quickly", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wbHTKEJZyhc", "yDoT73BWsdA"], "start_seconds": 
["20", "10"], "properties": ["stream, water, flow", "engine, revs, vehicle"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zofjfKhqLk8", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["background, metal, clings", "People, motor, brakes"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while a car is driving and a ticking sound is heard "], "question": 
"which entity is about a saw running?", "label": 0}, {"captions": ["a man speaks as a car is passing by", "wind blowing followed by a zoom"], "sample_ids": ["sK4u5T8hW78", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["a, car, pass", "wind, blow, zoom"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a small engine idles continuously", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y5WII6cTH7k", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["engine, idle, continuously", "a woman, laughs, animal"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a woman speaks happily and an animal chirps", "people applaud and hoot and chat quietly"], "sample_ids": ["uWAAAL4CIoc", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["a woman, chirps, animal", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": 
["a machine engine runs and a man speaks", "an infant crying as a woman laughs"], "sample_ids": ["vs65y4qmyBE", "xhmRY9yhC7c"], "start_seconds": ["340", "20"], "properties": ["engine, run, man", "a, laugh, infant"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a infant makes noise and is excited"], "sample_ids": ["vqZuVbG6-HI", "wIJK3-5y0kA"], "start_seconds": ["130", "30"], "properties": ["background, male, female", "noise, excited, infant"], "captions_pred_video": ["footage is blurry because it's raining outside", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "several insects fly while two men talk"], "sample_ids": ["sTpirNYo8vQ", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["a, tone, fast", "several, fly, men"], "captions_pred_video": ["of a man taking a selfie on a bus", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a woman speaking in a fast tone with a male?", "label": 0}, {"captions": ["an engine idles quietly then gradually becomes louder", "wind blows as people chatter quietly"], "sample_ids": ["vbr9mHKc8WM", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry 
and out of focus"], "captions_pred_audio": ["an engine is idling", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "a propeller rotates loudly and intensely"], "sample_ids": ["s4tUs779vBA", "ugHJF0hfYkg"], "start_seconds": ["160", "10"], "properties": ["a, sound, stop", "loud, intense, propeller"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sjlVMgdGSK0", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["car, revving, loudly", "People, motor, brakes"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which is not a vehicle", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["wy1eKjR7KC0", "yLy-WycbVVE"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "background, people, talk"], "captions_pred_video": ["two police officers riding motorcycles down the street", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity has more people talking", "label": 1}, {"captions": ["a 
child speaks in closed space", "water flows and trickles"], "sample_ids": ["yW6FWLSLkx4", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["child, space, speak", "water, flow, trickle"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a motorcycle engine works nearby"], "sample_ids": ["zCrAfDfv6-A", "tOSWIURC-4"], "start_seconds": ["30", "0"], "properties": ["person, mouse, click", "engine, work, nearby"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", null], "captions_pred_audio": ["a person whistles a song", "a lawn mower is running "], "question": "which entity is a person", "label": 0}, {"captions": ["a jet engine spools up and takes off", "a man speaks followed by another man speaking outside"], "sample_ids": ["vBslzh7saPw", "viuTg1M-dqg"], "start_seconds": ["90", "30"], "properties": ["engine, spools, takes", "two men, speak, follow"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a train horn blows as it passes by", "a steam engine runs and whistles as it passes by"], "sample_ids": ["zVacuqSb4LI", "se87d6yxEOA"], "start_seconds": ["30", "10"], "properties": ["horn, blows, train", "run, whistle, pass"], "captions_pred_video": ["by mike amstong a video by mike amstong a video 
by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a train is moving and blowing its whistle "], "question": "which train is more likely to blow its horn", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "several insects fly while two men talk"], "sample_ids": ["sa6TLVbooCc", "s-T9OVOiMLo"], "start_seconds": ["240", "330"], "properties": ["people, laugh, child", "several, fly, men"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more people", "label": 0}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vBHyYJ8pL0", "xfaoyyzw2WU"], "start_seconds": ["2", "180"], "properties": ["noise, door, opening", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": 
["mechanisms are ticking and a sliding door is opening and closing ", "an aircraft engine roars and a man speaks "], "question": "which noise is louder", "label": 1}, {"captions": ["a person speaks briefly", "a man speaks as a car is passing by"], "sample_ids": ["zOZleIRqZm4", "sK4u5T8hW78"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "a, car, pass"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a person talking briefly?", "label": 0}, {"captions": ["an animal bleats and cries out and metal bangs", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["xfudFO976zE", "vlS6YMeWAPo"], "start_seconds": ["0", "40"], "properties": ["animal, bleats, cry", "sheep, baa, birds"], "captions_pred_video": ["footage is blurry and shaky", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["a person is whistling", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sIXTftIuUgw", "sLUnaPT5gM8"], "start_seconds": ["90", "0"], "properties": ["person, whistling, person", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], 
"captions_pred_audio": ["a person whistling a song", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "water flows and trickles"], "sample_ids": ["sofxkNWaP0s", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "water, flow, trickle"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vzxHnu-SFEw", "wqZ135Ssz0"], "start_seconds": ["80", "60"], "properties": ["two objects, woman, speak", "two men, woman, birds"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], 
"captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["people speak softly as food sizzles", "a train horn blows as it passes by"], "sample_ids": ["yhQ2Lg-7qDY", "zVacuqSb4LI"], "start_seconds": ["130", "30"], "properties": ["food, sizzle, speak", "horn, blows, train"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a woman speaks and is crumpling paper", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xvDdE3zNf8Y", "xKB8O8LTs6s"], "start_seconds": ["120", "70"], "properties": ["A, crumple, paper", "music, gunfire, explosion"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman speaks and crumples paper", "music plays while a 
woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a person is snoring while sleeping", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vJrjSeP17yE", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["a person is sleeping, snoring, person", "clickety-clack, train, whistle"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a person snoring loudly", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "water pouring and bubbling"], "sample_ids": ["zofjfKhqLk8", "uyRfq-jKPpo"], "start_seconds": ["10", "50"], "properties": ["background, metal, clings", "water, bubbles, pouring"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds chirp and an insect 
buzzes around", "people applaud and hoot and chat quietly"], "sample_ids": ["t97k0cejSQE", "wwyfGO2J4"], "start_seconds": ["250", "90"], "properties": ["bird, chirp, insect", "people, applaud, hoot"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a stream of water runs briefly"], "sample_ids": ["zj2R0XoFr5k", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["airplane, boy, fly", "stream, water, run"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "several insects fly while two men talk"], "sample_ids": ["w2M4i1mklOA", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["alarm, gears, turn", "several, fly, men"], "captions_pred_video": ["footage of an antique clock", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "wind blowing followed by a zoom"], "sample_ids": ["su6FAOcOA8c", "vr8ZXjEBhMQ"], "start_seconds": ["4", "150"], "properties": ["engine, idle, woman", "wind, blow, zoom"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "is taken from a motorcycle's point of view"], 
"captions_pred_audio": ["a woman is speaking and a subway train is moving ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a propeller moves loudly nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["ugHJF0hfYkg", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["loud, propeller, move", "people, applaud, hoot"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a weapon fires multiple times"], "sample_ids": ["ul60S8TXDA8", "sMC07Ucy7kg"], "start_seconds": ["60", "10"], "properties": ["sound, distance, bell", "weapon, fire, multiple"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage is from a car's point of view"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["birds chirp as a bell rings", "goats bleat and people speak"], "sample_ids": ["ziUT9IFTkjg", "z5iUE5h0EPs"], "start_seconds": ["10", "30"], "properties": ["chirp, bell, ring", "goats bleat, people speak, language"], "captions_pred_video": [null, "of the goat in the barn"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a goat bleats and a man speaks"], "question": "which entity is speaking a language", "label": 1}, {"captions": ["paper folding and crinkling", "a person is snoring while sleeping"], "sample_ids": ["zPpG3RD8lSs", "vJrjSeP17yE"], "start_seconds": ["20", "40"], "properties": ["paper, fold, crinkle", "a person is sleeping, snoring, person"], "captions_pred_video": ["how to 
make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a woman speaks with water running", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wTideSjRFS0", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["water, running, woman", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["uKCSGgof8gI", "y8WEcpOlT3I"], "start_seconds": ["12", "40"], "properties": ["chirps, distance, signal", "harsh, wind, blows"], "captions_pred_video": ["footage of a street 
in a small town on a sunny day", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["s4Uz1Ffgo04", "su6FAOcOA8c"], "start_seconds": ["100", "4"], "properties": ["water, rushes, vehicle", "engine, idle, woman"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "water flows and trickles"], "sample_ids": ["sfAvvZwdLCY", "tB7hWb9gTuQ"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "water, flow, trickle"], "captions_pred_video": ["footage of the toilet in the bathroom", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a toilet is flushed", "water is splashing and gurgling"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a person is whistling", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sIXTftIuUgw", "tDVADusiIoc"], "start_seconds": ["90", "60"], "properties": ["person, whistling, person", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person whistling a song", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a gun shoots, followed by water 
sloshing nearby", "a clock ticktocks"], "sample_ids": ["wDVMhEdTiVw", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["gun, shoot, water", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a blurry image of trees and water in the forest", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "a clock ticktocks"], "sample_ids": ["skd2PphS6oI", "v-g-j2uTByM"], "start_seconds": ["190", "30"], "properties": ["ring, bird, vocalize", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zuua6-5goWw", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "engine, accelerate, idle"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage of a car driving down the street on a sunny day"], 
"captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "an engine is idling"], "question": "which is quieter", "label": 1}, {"captions": ["a man speaks as a machine runs", "an airplane engine runs"], "sample_ids": ["vD6lYD1l0BY", "yVPZ2MNWpms"], "start_seconds": ["330", "0"], "properties": ["a, machine, run", "engine, airplane, runs"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a car is driving by on the road "], "question": "which machine runs", "label": 1}, {"captions": ["a man is filing a hard object", "an insect buzzes around continuously"], "sample_ids": ["vveS8HT7Uog", "v25l1jef3JY"], "start_seconds": ["100", "0"], "properties": ["a man, hard, object", "buzzes, continuously, insect"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a door opens and closes", "an insect buzzes around continuously"], "sample_ids": ["vBHyYJ8pL0", "v25l1jef3JY"], "start_seconds": ["2", "0"], "properties": ["open, close, door", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a fly is buzzing around a microphone "], "question": "which entity is more likely to be a nuisance", "label": 1}, {"captions": ["water flows as men speak and yell", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vJ7JPEFhyLA", "sSMl2vc3ek"], "start_seconds": ["16", "20"], 
"properties": ["water, flow, men", "loud, multiple, distance"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "paper is crumpling consistently"], "sample_ids": ["xjvTpk2Zpr8", "v5cSxLaHADY"], "start_seconds": ["70", "0"], "properties": ["engine, run, wind", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a jet engine roars and wind blows ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "several insects fly while two men talk"], "sample_ids": ["tQWGZLItBXk", "s-T9OVOiMLo"], "start_seconds": ["170", "330"], "properties": ["voice, music, whoosh", "several, fly, men"], "captions_pred_video": ["worms revolution screenshots", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sZPuqDgX2V0", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["engine, accelerate, intercom", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "an engine is idling"], 
"question": "which entity shows a vehicle engine accelerating then running on idle?", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["wSVhSdj0F0", "wnpJndXuxLc"], "start_seconds": ["10", "50"], "properties": ["beep, clang, footsteps", "blows, vehicle, train"], "captions_pred_video": [null, "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is a train?", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a infant makes noise and is excited"], "sample_ids": ["yFB25fqfU8I", "wIJK3-5y0kA"], "start_seconds": ["300", "30"], "properties": ["wave, crash, shoreline", "noise, excited, infant"], "captions_pred_video": ["footage of a person surfing in the ocean", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a infant makes noise and is excited", "a infant makes noise and is excited"], "sample_ids": ["wIJK3-5y0kA", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["noise, excited, infant", "noise, excited, infant"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby cries and a woman speaks"], "question": "which infant is making noise and is excited", "label": 1}, {"captions": ["water flows as men speak and yell", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vJ7JPEFhyLA", "vYkA3cfXp5Q"], "start_seconds": ["16", "30"], "properties": ["water, 
flow, men", "engine, accelerate, idle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a dog barks and whimpers", "winds blows roughly as a vehicle races past"], "sample_ids": ["sShpyu2l4YQ", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["barks, whimpers, dog", "wind, blows, vehicle"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a dog is barking and growling", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "a man speaks followed by another man speaking outside"], "sample_ids": ["vf44CgrjT0A", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "two men, speak, follow"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a loud burp", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uZesmtKZGSw", "vfYTJq7nU"], "start_seconds": ["250", "130"], "properties": ["men, talk, cars", "rustling, ducks, quack"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yaln9y8I7ms", "vbZ-0lGPneg"], "start_seconds": ["230", "30"], "properties": ["female, flushes, toilet", "a woman, a television program, a bird"], "captions_pred_video": ["footage is blurry and out of focus", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "vehicles pass by on a roadway"], "sample_ids": ["su6FAOcOA8c", "tgbONvsP47Y"], "start_seconds": ["4", "0"], "properties": ["engine, idle, woman", "pass, vehicle, roadway"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an engine runs and a man speaks", "a car speeds away loudly followed by a car revving loudly and driving away while outside"], "sample_ids": ["yT5WfYMRr-U", "sjlVMgdGSK0"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "car, revving, loudly"], "captions_pred_video": ["a man in a red jacket and 
sunglasses driving a boat", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a car accelerates and revs its engine "], "question": "which entity is revving loudly", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "some tunes played by whistling"], "sample_ids": ["wyllXV6PjKo", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["a baby, a woman, a man", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman speaks and a baby cries", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vbr9mHKc8WM", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["noise, loudness, engine", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine is idling", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man woman speak while crickets sing", "people applaud and hoot and chat quietly"], "sample_ids": ["zTLVJCo4WEE", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["a, crickets, sing", "people, applaud, hoot"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a man speaks followed by another man speaking outside"], "sample_ids": ["vimzuGQvdcU", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "two men, speak, follow"], "captions_pred_video": ["a group of 
people are rafting down a river", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a dog barks and whimpers"], "sample_ids": ["sQwlkXjQabo", "sShpyu2l4YQ"], "start_seconds": ["10", "0"], "properties": ["liquid, surface, spray", "barks, whimpers, dog"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "the puppies are playing with a toy"], "captions_pred_audio": ["spraying followed by silence", "a dog is barking and growling"], "question": "which entity is a dog", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wjsXBsc7M40", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "a woman, something, fried"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a clock ticktocks in wind", "a man speaks as a car is passing by"], "sample_ids": ["yVumC9TGknc", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, wind", "a, car, pass"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "water flows and trickles"], "sample_ids": ["sxYkFKFIZD0", "tB7hWb9gTuQ"], "start_seconds": ["20", "30"], "properties": ["screech, man, door", "water, flow, trickle"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "wind blowing followed by a zoom"], "sample_ids": ["yYEVLuqEytU", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["animal, pig, background", "wind, blow, zoom"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["several sheep bleat and a man speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "females talk and laugh over gusting wind"], "sample_ids": ["w2bYrCVLT60", "un9VQlzgZM"], 
"start_seconds": ["120", "5"], "properties": ["ducks, speak, quack", "females, talk, laugh"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is more social", "label": 1}, {"captions": ["a baby laugh at a sputter", "a car accelerates and wind blows"], "sample_ids": ["sLUnaPT5gM8", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["laugh, sputter, baby", "accelerates, wind, blows"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a man speaks while turning a water faucet on"], "sample_ids": ["v-wcQf4BDY0", "vf9xf3vMsGM"], "start_seconds": ["120", "540"], "properties": ["bark, yip, sharply", "A man speaks while turning a water faucet on."], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "of the person washing their hands under the faucet"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking while water is running in the background"], "question": "which entity is a human", "label": 1}, {"captions": ["a machine runs continuously", "a machine beeps continuously"], "sample_ids": ["wdXV3Pv0jiY", "y682ml90jGw"], "start_seconds": ["11", "11"], "properties": ["machine, running, continuously", "beeps, machine, continuously"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a beeping sound is being made "], "question": "which machine beeps continuously", "label": 1}, {"captions": ["water flows followed by women 
screaming", "wind blows as people chatter quietly"], "sample_ids": ["w5W5Kqtc8E", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sjlVMgdGSK0", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["car, revving, loudly", "engine, revs, vehicle"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which vehicle is revving loudly", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a woman speaks happily and an animal chirps"], "sample_ids": ["uPDn2BFTHk", "uWAAAL4CIoc"], "start_seconds": ["140", "0"], "properties": ["lady, laugh, baby", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a dog is barking "], "question": "which entity has a baby?", "label": 0}, {"captions": ["a woman sneezes then speaks", "several ducks quack and cocks crow far away"], "sample_ids": ["x4dZyf9Gbj0", "sNB8zxXneIM"], "start_seconds": ["130", "20"], "properties": ["sneezes, speaks, woman", "several, quack, cocks"], "captions_pred_video": ["footage is blurry and out of focus", "a group of geese in a cage"], "captions_pred_audio": ["a woman sneezes and speaks", "a rooster is crowing and wind is blowing "], "question": "which entity is a 
bird", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a telephone rings followed by a woman talking"], "sample_ids": ["vdoxuJn9lTc", "tGcFnX0GHI"], "start_seconds": ["40", "0"], "properties": ["burp, loud, girl", "ring, talk, woman"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a stream of water runs briefly", "pigeons vocalize and birds chirp"], "sample_ids": ["x-PeY8Yb8M4", "uiS58TNyUiw"], "start_seconds": ["300", "430"], "properties": ["stream, water, run", "vocalize, bird, chirp"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "of the pigeon in the cage"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a stream of water?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "some men converse over an engine running"], "sample_ids": ["xjhAnI2q6hM", "sCiy7QS1U"], "start_seconds": ["6", "300"], "properties": ["engine revs, vehicle, people", "men, converse, engine"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows people talking to each other?", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a man speaks followed by another man speaking outside"], "sample_ids": ["zOZleIRqZm4", "viuTg1M-dqg"], "start_seconds": ["80", "30"], "properties": ["rustling, leaves, person", "two men, speak, follow"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of water coming out 
of a hole in the ground"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yZmhM1HcsyE", "tDVADusiIoc"], "start_seconds": ["4", "60"], "properties": ["engine, roar, water", "water, radio, man"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sfAvvZwdLCY", "yDoT73BWsdA"], "start_seconds": ["20", "10"], "properties": ["water drains, flushes, water", "engine, revs, vehicle"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a toilet is flushed", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man talks while vehicles pass by", "several insects fly while two men talk"], "sample_ids": ["sK4u5T8hW78", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["a, man, talk", "several, fly, men"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more vehicles passing by", "label": 0}, {"captions": ["a child speaks in closed space", "paper is crumpling consistently"], "sample_ids": ["yW6FWLSLkx4", "v5cSxLaHADY"], "start_seconds": ["40", "0"], "properties": ["child, space, speak", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["sNB8zxXneIM", "yLy-WycbVVE"], "start_seconds": ["20", "30"], "properties": ["several, quack, cocks", "background, people, talk"], "captions_pred_video": ["a group of geese in a cage", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["a infant makes noise and is excited", "someone is typing on a computer keyboard"], "sample_ids": ["wIJK3-5y0kA", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["noise, excited, infant", "keyboard, type, computer"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "how to make money on youtube in spanish"], 
"captions_pred_audio": ["a baby cries and a woman speaks", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["an engine runs and a man speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yT5WfYMRr-U", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["engine, run, man", "engine, idle, woman"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity has a man speaking while an engine runs?", "label": 0}, {"captions": ["a woman speaks with water running", "paper is crumpling consistently"], "sample_ids": ["wTideSjRFS0", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["water, running, woman", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zFjIWfSD-4", "uYT5gxnyMWM"], "start_seconds": ["410", "50"], "properties": ["People, motor, brakes", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "multiple birds vocalize and wind 
blows"], "sample_ids": ["zk-xJGQU8-4", "uoGVs9yUqY4"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "multiple, vocalize, wind"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "for how to make a wooden shed door youtube"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "birds are chirping and flapping their wings with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vf9xf3vMsGM", "tiDFTC-5vU"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "male, duck, laugh"], "captions_pred_video": ["of the person washing their hands under the faucet", null], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man is speaking and ducks are quacking"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a man speaks over intermittent keyboard taps", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tw76HGONaKg", "tiDFTC-5vU"], "start_seconds": ["570", "30"], "properties": ["audio, man, keyboard", "male, duck, laugh"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, 
the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck in it?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "small dogs yip and bark sharply"], "sample_ids": ["x9JovgqUcs", "v-wcQf4BDY0"], "start_seconds": ["500", "120"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a duck quacks continuously"], "sample_ids": ["w5W5Kqtc8E", "vh30P49Po6s"], "start_seconds": ["100", "30"], "properties": ["wind, blow, vehicle", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "someone whistles a tune"], "sample_ids": ["v0x1odnXtP0", "sIXTftIuUgw"], "start_seconds": ["210", "90"], "properties": ["keyboard, type, computer", "someone, tune, whistle"], "captions_pred_video": ["how to make money on youtube in spanish", null], "captions_pred_audio": ["a person is typing on a keyboard", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a car accelerates and wind blows"], "sample_ids": 
["tOSWIURC-4", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["engine, work, nearby", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "someone is typing on a computer keyboard"], "sample_ids": ["zofjfKhqLk8", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["background, metal, clank", "keyboard, type, computer"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "how to make money on youtube in spanish"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a person is typing on a keyboard"], "question": "which is not a machine", "label": 1}, {"captions": ["a clock ticktocks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["v-g-j2uTByM", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["ticktocks, clock, ticktocks", "men, talk, cars"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["bird 
squawks are accompanied by a man and woman speaking", "someone is typing on a computer keyboard"], "sample_ids": ["wqZ135Ssz0", "v0x1odnXtP0"], "start_seconds": ["60", "210"], "properties": ["man, woman, squawks", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a person is typing on a keyboard"], "question": "which is not a type of keyboard", "label": 0}, {"captions": ["someone is burping continuously", "a duck quacks continuously"], "sample_ids": ["y636gklDioE", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "quacks, continuously, duck"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person burps loudly several times", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["goats bleat and people speak", "a woman speaks as she rubs two objects together"], "sample_ids": ["z5iUE5h0EPs", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["goats bleat, people speak, language", "two objects, woman, speak"], "captions_pred_video": ["of the goat in the barn", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a goat bleats and a man speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "dishes cling together then a man begins to speak"], "sample_ids": ["wRBHTgrbiwg", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["bird, owl, speak", "cling, speak, dishes"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking briefly?", "label": 0}, {"captions": ["three men talk while wind blows and some liquid flows", "a car speeding up in the distance"], "sample_ids": ["vJ7JPEFhyLA", "u0TrcHhkPQ"], "start_seconds": ["16", "20"], "properties": ["three men, wind, flow", "distance, car, speed"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["continuous snoring", "women speak as water runs briefly, children call out, and a man speaks"], "sample_ids": ["sLkeqCDJIyw", "uRExseg-0XI"], "start_seconds": ["120", "210"], "properties": ["loud, snoring, noise", "woman, man, water"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while water is running and birds are chirping "], "question": "which entity is quieter", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "an engine runs loudly"], "sample_ids": ["xyL9F5VrjkE", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["engine, run, wind", "loud, engine, run"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage is blurry because it's raining outside"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a lawn mower is running and men are speaking "], "question": "which entity is running", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["w2M4i1mklOA", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["loud, chime, bell", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of an antique clock", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["rustling with distant murmuring", "a stream of water runs briefly"], "sample_ids": ["wnNNcxAPwGQ", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["sound, distance, rustling", "stream, water, run"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": 
["water rushes and then a vehicle zooms past", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["s4Uz1Ffgo04", "zj2R0XoFr5k"], "start_seconds": ["100", "50"], "properties": ["water, rushes, vehicle", "airplane, boy, fly"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["uKCSGgof8gI", "siJFXfGWgDk"], "start_seconds": ["12", "50"], "properties": ["chirps, distance, signal", "man, woman, vehicle"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "someone snores nearby"], "sample_ids": ["yLy-WycbVVE", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["background, people, talk", "someone snores, nearby, someone"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a person is snoring loudly"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "water flows as men speak and yell"], 
"sample_ids": ["slZLHwNbbt4", "vJ7JPEFhyLA"], "start_seconds": ["300", "16"], "properties": ["clap, distance, horn", "water, flow, men"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "wind blows in gusts as a woman speaks in the distance"], "sample_ids": ["x6ijhqRY38s", "uC9dtII1KDI"], "start_seconds": ["250", "150"], "properties": ["bowl, silverware, man", "wind, gusts, distance"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a person riding a horse in a riding arena"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a woman is speaking with wind noise and breathing in the background "], "question": "which entity is more likely to be in a bowl", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a clock ticktocks"], "sample_ids": ["zkKdxzNC97Y", "v-g-j2uTByM"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a door is opened and closed", "a clock is ticking loudly"], "question": "which entity is silent", "label": 0}, {"captions": ["a large bell chimes back and forth loudly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["w2M4i1mklOA", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["loud, chime, bell", "a woman, something, fried"], "captions_pred_video": ["footage of 
an antique clock", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a person sniffles and sneezes", "paper is crumpling consistently"], "sample_ids": ["uRlbY6aoBU", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["sneezes, sniffles, person", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is sneezing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zkKdxzNC97Y", "vbZ-0lGPneg"], "start_seconds": ["27", "30"], "properties": ["hard, surface, door", "a woman, a television program, a bird"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a door is opened and closed", "a woman is speaking and a dog is whimpering"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["continuous snoring", "a clicking followed by some people laughing and a kid speaking"], "sample_ids": ["sLkeqCDJIyw", "vz8868znkVQ"], "start_seconds": ["120", "60"], "properties": ["loud, snoring, noise", "audio, click, kid speaking"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "a video of a plane flying over a cloudy sky"], "captions_pred_audio": ["a person is snoring loudly", "a baby is laughing and breathing with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "water flows as men speak and yell"], "sample_ids": ["yJ0TePmaOo", "vJ7JPEFhyLA"], "start_seconds": ["390", "16"], "properties": ["two hard objects, man, speak", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 1}, {"captions": ["a toilet flushes and water drains", "some men converse over an engine running"], "sample_ids": ["sfAvvZwdLCY", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["water drains, flushes, water", "men, converse, engine"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying then a woman speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["ukxt9I7eMMg", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["food, woman, speak", "cling, speak, dishes"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "a baby cries and a woman speaks"], "sample_ids": ["vddP56-ogds", "tMbMDvT50j8"], 
"start_seconds": ["30", "12"], "properties": ["liquid, laughs, man", "a, cry, woman"], "captions_pred_video": [null, "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a baby cries and a woman speaks"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a telephone rings followed by a woman talking"], "sample_ids": ["vh30P49Po6s", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["loud, continuous, quacks", "ring, talk, woman"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a dial tone sounds followed by a woman speaking"], "question": "which entity is quieter", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xyx6eNVEYRY", "su6FAOcOA8c"], "start_seconds": ["380", "4"], "properties": ["loud, engine, muffles", "engine, idle, woman"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a woman is speaking and a subway train is moving "], "question": "which engine is quieter", "label": 1}, {"captions": ["a man speaks as a car is passing by", "loud ringing of a telephone stops followed by a man speaking and a digital beep"], "sample_ids": ["sK4u5T8hW78", "uzQnlJXBbOM"], "start_seconds": ["30", "50"], "properties": ["a, car, pass", "ringing, beep, stop"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of a person using a cell phone on a table"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a telephone rings and a man speaks"], "question": "which entity is about a car passing by?", "label": 0}, {"captions": ["people speak softly as food sizzles", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["yhQ2Lg-7qDY", "x9JovgqUcs"], "start_seconds": ["130", "500"], "properties": ["food, sizzle, speak", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a horn rings out as a machine runs by"], "sample_ids": ["s4Uz1Ffgo04", "slZLHwNbbt4"], "start_seconds": ["100", "300"], "properties": ["roars, background, people speaking", "a, horn, run"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is quieter", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a horn rings out as a machine runs by"], "sample_ids": ["wnpJndXuxLc", "slZLHwNbbt4"], "start_seconds": ["50", "300"], "properties": ["beeps, loud, whistle", "a, horn, run"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a train coming down 
the tracks on a sunny day"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vW4x7S1VfQc", "y8WEcpOlT3I"], "start_seconds": ["150", "40"], "properties": ["clacking, oil, woman", "harsh, wind, blows"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "on how to use a sewing machine youtube"], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking to another man?", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w34HjHr6gAY", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["beeps, hit, woman", "music, gunfire, explosion"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a saw finishes running as metal clings in the 
background", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["zofjfKhqLk8", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["background, metal, clings", "engine, revs, vehicle"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["food is frying and sizzles", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zNRChLjqcU", "y8WEcpOlT3I"], "start_seconds": ["220", "40"], "properties": ["food is frying, sizzles, food", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking with wind noise in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yZp6xizR0yU", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a woman is speaking and a dog is whimpering"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["male speech with light ticking", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xO-Q2BlIIPU", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["male, speech, ticking", "water, radio, man"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 
2016", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["t25U-v4k4ts", "tIY7qOV3rEM"], "start_seconds": ["40", "0"], "properties": ["a, chirps, bird", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a dog is barking and a cat is meowing"], "question": "which entity has more animals", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "an infant crying as a woman laughs"], "sample_ids": ["vYkA3cfXp5Q", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["speed, idle, accelerate", "a, laugh, infant"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["an engine is idling", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a man speaks as a car is passing by"], "sample_ids": ["sHbXC6na9hg", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "a, car, pass"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a person cutting wood?", "label": 0}, {"captions": ["an engine works in idle nearby followed by a man talking", "a man speaks as a car is passing by"], "sample_ids": ["wqADXCzngMw", "sK4u5T8hW78"], "start_seconds": ["340", "30"], "properties": ["engine, idle, man", "a, car, pass"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a helicopter engine runs", "women speak as water runs briefly, children call out, and a man speaks"], "sample_ids": ["t5ZbXbniOWk", "uRExseg-0XI"], "start_seconds": ["30", "210"], "properties": ["engine, helicopter, run", "woman, man, water"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while water is running and birds are chirping "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["food is frying then a 
woman speaks", "a child speaks in closed space"], "sample_ids": ["ukxt9I7eMMg", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["food, woman, speak", "child, space, speak"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child speaking in a closed space?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a mechanical buzzing getting louder"], "sample_ids": ["v0x1odnXtP0", "sEprKHm8Sj8"], "start_seconds": ["210", "90"], "properties": ["keyboard, type, computer", "noise, loud, buzzing"], "captions_pred_video": ["how to make money on youtube in spanish", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a person is typing on a keyboard", "a race car accelerates and revs its engine "], "question": "which is a noise", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zFjIWfSD-4", "wz7N8YRy74I"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "paper folding 
and crinkling"], "sample_ids": ["x6ijhqRY38s", "zPpG3RD8lSs"], "start_seconds": ["250", "20"], "properties": ["bowl, silverware, man", "paper, fold, crinkle"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "the wind blows and a mouse clicks "], "question": "which is not a person", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wyllXV6PjKo", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["a kid, talk, cry", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["someone snores nearby", "a crowd yells, reacts and applauds"], "sample_ids": ["spJCm8tD9Zo", "wztCSUxOf8"], "start_seconds": ["90", "130"], "properties": ["someone snores, nearby, someone", "a 
crowd, yells, applauds"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a baby laugh at a sputter"], "sample_ids": ["vSeGhaZt-aI", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, run", "laugh, sputter, baby"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is a baby?", "label": 1}, {"captions": ["a person is whistling a tune", "a stream of water flows as people talk and wind blows"], "sample_ids": ["scYRUkrFLiQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, tune, whistle", "stream, water, flow"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane engine spools and people speak", "wind blowing followed by a zoom"], "sample_ids": ["wTjoRj1se3U", "vr8ZXjEBhMQ"], "start_seconds": ["390", "150"], "properties": ["airplane, engine, spool", "wind, blow, zoom"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a jet engine is running and people are talking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the 
distance", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uC9dtII1KDI", "wz7N8YRy74I"], "start_seconds": ["150", "30"], "properties": ["wind, gusts, distance", "rooster, crow, background, men"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a frog vocalizes as birds chirp"], "sample_ids": ["xyL9F5VrjkE", "wqUmIEzuNz4"], "start_seconds": ["20", "30"], "properties": ["engine, run, wind", "frog, bird, vocalize"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "a frog sitting in the grass on a sunny day"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a cat meows and rustles"], "question": "which entity is not a vehicle", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "birds chirp and objects are moved around"], "sample_ids": ["wvKpEYswXO0", "yPUYU6t3rwo"], "start_seconds": ["150", "370"], "properties": ["water, tap, run", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "insects buzz and a man speaks"], "question": "which entity is about moving objects around?", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "plastic is tapped on while someone speaks"], "sample_ids": ["uPDn2BFTHk", "wvKpEYswXO0"], "start_seconds": ["140", "150"], "properties": ["woman, laughs, speaks", "plastic, 
tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a man talks as several small engines run", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["u9A6VZQCZpU", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["a, man, talk", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "wind blowing followed by a zoom"], "sample_ids": ["wRV8yMk886E", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["liquid, spray, nozzle", "wind, blow, zoom"], "captions_pred_video": ["two cars are parked in a parking lot at night", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man speaks followed by a loud burst", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["people speak then an engine runs", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uMTTDZ2mb4", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["engine, run, people", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a man speaks as a car is passing by"], "sample_ids": 
["zofjfKhqLk8", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "a, car, pass"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an engine runs and wind blows", "a dark barks and whimpers"], "sample_ids": ["vs65y4qmyBE", "sYj4hpDUZDQ"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "barks, whimpers, dark"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a brown and white dog standing in front of a wall with its mouth open"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a dog barks and a cat meows"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "an engine runs loudly"], "sample_ids": ["shmR4OZtzqA", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["man, engine, idle", "loud, engine, run"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb 
on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man speaks while a motor runs", "a lawn mower is running and men are speaking "], "question": "which engine is running loudly", "label": 1}, {"captions": ["speaking following by laughing and clapping", "an insect buzzes around continuously"], "sample_ids": ["u2f5NpsoHBg", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["person, laugh, clap", "buzzes, continuously, insect"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a fly is buzzing around a microphone "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a child yells and another yells", "a man speaks as a motor runs in the background"], "sample_ids": ["vMDHu7Lxcgw", "xZepNM9qcRA"], "start_seconds": ["410", "30"], "properties": ["two, yell, child", "background, motor, run"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a woman and man are speaking", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["vbpKkWvfOu4", "siJFXfGWgDk"], "start_seconds": ["560", "50"], "properties": ["two people, speaking, woman, man", "man, woman, vehicle"], "captions_pred_video": ["2012-07-20 17 30 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking and birds are chirping in the background "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "wind blows as people chatter quietly"], "sample_ids": ["skd2PphS6oI", "xBxDz0CFVn0"], "start_seconds": ["190", "30"], "properties": ["ring, bird, vocalize", "wind, chatter, people"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage is blurry and out of focus"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a kid speaks followed by music playing"], "sample_ids": ["xC8kbrKJmco", "tQWGZLItBXk"], "start_seconds": ["0", "170"], "properties": ["background, goat, scream", "music, kid, speak"], "captions_pred_video": [null, "worms revolution screenshots"], "captions_pred_audio": ["a goat is bleating ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a door slams shut roughly", "a man speaks as a car is passing by"], "sample_ids": ["zkKdxzNC97Y", "sK4u5T8hW78"], "start_seconds": ["27", "30"], "properties": ["a door, slams, shut", "a, car, pass"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a beep repeats multiple times", "a man speaks with another voice speaking in the background"], "sample_ids": ["y682ml90jGw", "u21-Z5gJCB8"], "start_seconds": ["11", "30"], "properties": ["beep, repeat, multiple", "background, voice, man"], "captions_pred_video": [null, "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a single voice speaking in the background?", "label": 0}, {"captions": ["a motorcycle engine is idling", "an airplane engine spools and people speak"], "sample_ids": ["vZAqdHZ81yA", "wTjoRj1se3U"], "start_seconds": ["180", "390"], "properties": ["engine, motorcycle, idling", "airplane, engine, spool"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["an engine is idling loudly", "a jet engine is running and people are talking"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a person sniffs and sneezes", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["uRlbY6aoBU", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["sneezes, person, sniffs", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking while 
the wind is blowing and water is splashing"], "question": "which entity is about a person speaking over a radio?", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "paper folding and crinkling"], "sample_ids": ["xKB8O8LTs6s", "zPpG3RD8lSs"], "start_seconds": ["70", "20"], "properties": ["music, gunfire, explosion", "paper, fold, crinkle"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "the wind blows and a mouse clicks "], "question": "which entity is more likely to be a movie", "label": 0}, {"captions": ["a drill runs and two people laugh", "someone is typing on a computer keyboard"], "sample_ids": ["tEE3MpBt1sg", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["two people, laugh, drill", "keyboard, type, computer"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "how to make money on youtube in spanish"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a person is typing on a keyboard"], 
"question": "which is not a drill", "label": 1}, {"captions": ["an adult woman and an adult man speak", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zTLVJCo4WEE", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["two people, adult, speak", "rooster, crow, background, men"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["water splashes and a door squeaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["sdXV-ylviw", "vzxHnu-SFEw"], "start_seconds": ["190", "80"], "properties": ["sound, splash, door", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity has a 
door?", "label": 0}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "small dogs yip and bark sharply"], "sample_ids": ["tDVADusiIoc", "v-wcQf4BDY0"], "start_seconds": ["60", "120"], "properties": ["water, radio, man", "bark, yip, sharply"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a drill runs and two people laugh", "water flows as men speak and yell"], "sample_ids": ["tEE3MpBt1sg", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["two people, laugh, drill", "water, flow, men"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a drill running and two people laughing?", "label": 0}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a person is burping while a girl speaks"], "sample_ids": ["tDVADusiIoc", "vdoxuJn9lTc"], "start_seconds": ["60", "40"], "properties": ["wind, radio, waves", "person, burp, girl"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a child speaks followed by a burp"], "question": "which entity is a person speaking over a radio?", "label": 0}, {"captions": ["continuous snoring", "leaves rustling followed by a small bell chiming as birds chirp in the background"], 
"sample_ids": ["sLkeqCDJIyw", "ziUT9IFTkjg"], "start_seconds": ["120", "10"], "properties": ["loud, snoring, noise", "background, birds, rustling"], "captions_pred_video": [", what is the man doing on the couch? sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "birds are chirping and a chime is ringing "], "question": "which noise is quieter", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["w34HjHr6gAY", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["beeps, hit, woman", "People, motor, brakes"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vYkA3cfXp5Q", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["engine, accelerate, idle", "music, gunfire, explosion"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an engine is idling", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", 
"label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "race cars go around a track as a man commentates"], "sample_ids": ["y8WEcpOlT3I", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["harsh, wind, blows", "car, track, man"], "captions_pred_video": ["on how to use a sewing machine youtube", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars going around a track?", "label": 1}, {"captions": ["a person is snoring while sleeping", "vehicles pass by on a roadway"], "sample_ids": ["vJrjSeP17yE", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "pass, vehicle, roadway"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes and water drains", "someone is typing on a computer keyboard"], "sample_ids": ["sfAvvZwdLCY", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["water drains, flushes, water", "keyboard, type, computer"], "captions_pred_video": ["footage of the toilet in the bathroom", "how to make money on youtube in spanish"], 
"captions_pred_audio": ["a toilet is flushed", "a person is typing on a keyboard"], "question": "which object is used to type on a computer", "label": 1}, {"captions": ["a man speaks as a machine runs", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vD6lYD1l0BY", "w5W5Kqtc8E"], "start_seconds": ["330", "100"], "properties": ["a, machine, run", "wind, blow, vehicle"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a man speaking as a machine runs?", "label": 0}, {"captions": ["a baby cries and wails as an adult female speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["zliInBdC98Y", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["a, baby, cries, wails", "people, applaud, hoot"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", null], "captions_pred_audio": ["a baby cries and a woman speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle engine is idling", "a person is burping while a girl speaks"], "sample_ids": ["vZAqdHZ81yA", "vdoxuJn9lTc"], "start_seconds": ["180", "40"], "properties": ["engine, motorcycle, idling", "person, burp, girl"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["an engine is idling loudly", "a child speaks followed by a burp"], "question": "which entity is not a person?", "label": 0}, {"captions": ["an infant crying frantically", "some men converse over an engine running"], "sample_ids": ["zwOBqeFTgiU", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["cry, infant, 
frantically", "men, converse, engine"], "captions_pred_video": ["of the baby crying in the car seat", null], "captions_pred_audio": ["a baby cries loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "a woman speaks as she rubs two objects together"], "sample_ids": ["vbpKkWvfOu4", "vzxHnu-SFEw"], "start_seconds": ["560", "80"], "properties": ["a, woman, man", "two objects, woman, speak"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman is speaking", "label": 1}, {"captions": ["an adult woman and an adult man speak", "an audience gives applause as a man yells and a group 
sings"], "sample_ids": ["zTLVJCo4WEE", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["two people, adult, speak", "applause, audience, yells"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["tDlysoZiA1I", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["animal, grunts, chirps", "animal, grunts, snorts"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a baby is crying"], "question": "which entity has more grunts", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sfAvvZwdLCY", "wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "rooster, crow, background, men"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "water is sprayed across a hard surface"], "sample_ids": ["uC9dtII1KDI", "sQwlkXjQabo"], "start_seconds": ["150", "10"], "properties": ["wind, gusts, distance", "water, spray, surface"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "a close-up of 
a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["siJFXfGWgDk", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["man, woman, vehicle", "music, gunfire, explosion"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, {"captions": ["a person is snoring while sleeping", "continuous snoring"], "sample_ids": ["vJrjSeP17yE", "sLkeqCDJIyw"], "start_seconds": ["40", "120"], "properties": ["a person is sleeping, snoring, person", "loud, snoring, noise"], "captions_pred_video": ["a black background with a small plane flying in the sky", ", what is the man doing on the couch? 
sleeping"], "captions_pred_audio": ["a person snoring loudly", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "some men converse over an engine running"], "sample_ids": ["sYITalLZjj4", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["stream, flow, wind", "men, converse, engine"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], "captions_pred_audio": ["wind blows and birds chirp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks while vehicles pass by", "people applaud and hoot and chat quietly"], "sample_ids": ["sK4u5T8hW78", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["a, man, talk", "people, applaud, hoot"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a duck quacks loudly and continuously"], "sample_ids": ["vfYTJq7nU", "vh30P49Po6s"], "start_seconds": ["130", "30"], "properties": ["ducks, quack, man", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a duck is quacking loudly"], "question": "which duck is quacking 
loudly", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "paper is crumpling consistently"], "sample_ids": ["wz7N8YRy74I", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["rooster, crow, background, people", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["w2M4i1mklOA", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["alarm, gears, turn", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of an antique clock", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wvKpEYswXO0", "yDoT73BWsdA"], "start_seconds": ["150", "10"], "properties": ["water, tap, run", "engine, revs, vehicle"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a clock ticktocks", "a stream of water flows quickly"], "sample_ids": ["v-g-j2uTByM", "wbHTKEJZyhc"], 
"start_seconds": ["30", "20"], "properties": ["ticktocks, clock, ticktocks", "stream, water, flow"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a clock is ticking loudly", "a waterfall is flowing and people are speaking "], "question": "which entity is moving", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wP8ZKrlx3oA", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["heavy, rain, fall", "loud, jet engine, roar"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a heavy rain is falling on a surface", "an aircraft engine roars and a man speaks "], "question": 
"which is louder", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["y2ZBGpgbhHM", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["birds, tweet, pant", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "an infant crying as a woman laughs"], "sample_ids": ["zF8yoL0rkbI", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["engine, run, someone", "a, laugh, infant"], "captions_pred_video": ["footage of the traffic on the street at night", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wSVhSdj0F0", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], "properties": ["horn honks, keys jingle, electronic beep", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a man speaks as a car is passing by"], "sample_ids": ["t97k0cejSQE", "sK4u5T8hW78"], "start_seconds": ["250", "30"], "properties": ["bird, chirp, insect", "a, car, pass"], "captions_pred_video": ["a bee on a purple thistle flower", "for 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a moving object", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a man speaks with another voice speaking in the background"], "sample_ids": ["v0wPrLBI3hg", "u21-Z5gJCB8"], "start_seconds": ["30", "30"], "properties": ["vocalize, bird, speak", "background, voice, man"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to birds?", "label": 0}, {"captions": ["an engine runs and wind blows", "wind blows as people chatter quietly"], "sample_ids": ["vs65y4qmyBE", "xBxDz0CFVn0"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "wind, chatter, people"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "people applaud and hoot and chat quietly"], "sample_ids": ["vXlk0lIQBFo", "wwyfGO2J4"], "start_seconds": ["470", "90"], "properties": ["wind, talk, vocalize", "people, applaud, hoot"], "captions_pred_video": ["- a woman 
and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "winds blows roughly as a vehicle races past"], "sample_ids": ["vJvryTwuAV8", "xjvTpk2Zpr8"], "start_seconds": ["16", "70"], "properties": ["audience, cheer, man", "wind, blows, vehicle"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a guy speaks with birds chirping in the background"], "sample_ids": ["vBslzh7saPw", "v5P-ThUCINM"], "start_seconds": ["90", "400"], "properties": ["power, scream, increase", "background, chirp, bird"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and birds are chirping"], "question": "which entity is quieter", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "wind blows as people chatter quietly"], "sample_ids": ["xyL9F5VrjkE", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["wind, blows, vehicle", "wind, chatter, people"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage is blurry and out of focus"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", 
"a vehicle engine accelerating then running on idle"], "sample_ids": ["uiItxDsDMFI", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "engine, accelerate, idle"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a saw is being used with background noise ", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "water flows as men speak and yell"], "sample_ids": ["yI-KvObbDoY", "vJ7JPEFhyLA"], "start_seconds": ["260", "16"], "properties": ["sound, smack, wind", "water, flow, men"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of water flowing?", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["w0xsN8X18Y", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["music, surface, rain", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a duck quacks and a woman speaks"], "question": "which entity is about ducks?", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a horn rings out as a machine runs by"], "sample_ids": ["s4Uz1Ffgo04", "slZLHwNbbt4"], "start_seconds": ["100", "300"], "properties": ["water, rushes, motorcycle", "a, horn, run"], "captions_pred_video": ["footage of an ambulance 
arriving at the scene of an accident", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["t25U-v4k4ts", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["a, chirps, bird", "a woman, a television program, a bird"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird that chirps?", "label": 0}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zofjfKhqLk8", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird in it?", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "people applaud and hoot and chat quietly"], "sample_ids": ["zY3icUyMdh8", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["dog, bark, engine", "people, applaud, hoot"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are 
barking and squealing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person speaks briefly", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zOZleIRqZm4", "uYT5gxnyMWM"], "start_seconds": ["80", "50"], "properties": ["person, talk, brief", "a, scream, girl"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a recording of a person talking?", "label": 0}, {"captions": ["a man rubs two objects together then speaks", "a car accelerates and wind blows"], "sample_ids": ["vveS8HT7Uog", "u0TrcHhkPQ"], "start_seconds": ["100", "20"], "properties": ["a man, objects, speak", "accelerates, wind, blows"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a race car accelerates and revs its engine "], "question": "which object is moving", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["skd2PphS6oI", "zl9Dqx-j7q4"], "start_seconds": ["190", "6"], "properties": ["ring, bird, vocalize", "engine, laugh, loud"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vYkA3cfXp5Q", 
"yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["engine, accelerate, idle", "engine, revs, vehicle"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["an engine is idling", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle engine", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sHbXC6na9hg", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "rooster, crow, background, men"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a person cutting wood?", "label": 0}, {"captions": ["goats bleat and metal clings", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tH17JPjDPnc", "uYT5gxnyMWM"], "start_seconds": ["260", "50"], "properties": ["bleat, metal, clings", "female, spraying, scream"], "captions_pred_video": ["feed of the goats eating hay in the barn", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a person speaks briefly", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zOZleIRqZm4", "vbZ-0lGPneg"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "a woman, a television program, a bird"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a man holding a baby duck in his 
hands"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a dog is whimpering"], "question": "which entity has more people talking", "label": 1}, {"captions": ["a male speaks and another male speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["viuTg1M-dqg", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["two males, speaking, male", "wind, blows, vehicle"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "a duck quacks loudly and continuously"], "sample_ids": ["ugHJF0hfYkg", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "loud, continuous, quacks"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a helicopter is flying overhead ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks in wind", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yVumC9TGknc", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["ticktocks, clock, wind", "engine, laugh, loud"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a series of beeps and chirps", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a car accelerates and wind blows"], "sample_ids": ["w34HjHr6gAY", "u0TrcHhkPQ"], 
"start_seconds": ["30", "20"], "properties": ["beeps, squawk, child speaking", "accelerates, wind, blows"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zkKdxzNC97Y", "yajyRTUQk3U"], "start_seconds": ["27", "400"], "properties": ["hard, surface, door", "a woman, something, fried"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a door is opened and closed", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["tIY7qOV3rEM", "ziUT9IFTkjg"], "start_seconds": ["0", "10"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "background, birds, rustling"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "birds are chirping and a chime is ringing "], "question": "which entity has a background of birds 
chirping?", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "an engine runs loudly"], "sample_ids": ["zofjfKhqLk8", "vqZuVbG6-HI"], "start_seconds": ["10", "130"], "properties": ["background, metal, clank", "loud, engine, run"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["paper is crumpling consistently", "some men converse over an engine running"], "sample_ids": ["v5cSxLaHADY", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "men, converse, engine"], "captions_pred_video": ["footage of the person holding a pair of scissors", null], "captions_pred_audio": ["paper is crumpled and crinkled", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video", "label": 1}, {"captions": ["long loud burping by a man", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["xmiUIOhtZyQ", "sLUnaPT5gM8"], "start_seconds": ["60", "0"], "properties": ["loud, burp, man", "loud, laughter, intermittent"], "captions_pred_video": ["homer simpson drinking a beer", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a person burps and music plays in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wqN6IIHw3po", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["rain, surface, fall", "three men, wind, flow"], "captions_pred_video": ["in your own words 
what is happening in this screenshot? blood splattered all over the place", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking while rain falls onto a hard surface?", "label": 0}, {"captions": ["a toilet flushes and water drains", "roadway noise occurs and a truck accelerates"], "sample_ids": ["sfAvvZwdLCY", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["water drains, flushes, water", "noise, truck, accelerate"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a toilet is flushed", "a car is driving on the road "], "question": "which entity is a source of noise", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "pigeons vocalize and birds chirp"], "sample_ids": ["yYJksgsxx5U", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["audio, clicks, scraping", "vocalize, bird, chirp"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man speaks over intermittent keyboard taps", "birds chirp and objects are moved around"], "sample_ids": ["tw76HGONaKg", "yPUYU6t3rwo"], "start_seconds": ["570", "370"], "properties": ["audio, man, keyboard", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of 
zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "waves crash against a shoreline and people speak"], "sample_ids": ["slZLHwNbbt4", "yFB25fqfU8I"], "start_seconds": ["300", "300"], "properties": ["a, horn, run", "wave, crash, shoreline"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a clock ticktocks briefly", "a speedboat passes quickly on the water"], "sample_ids": ["u7C-AEBQM", "tjmoSi330GM"], "start_seconds": ["30", "23"], "properties": ["ticktocks, clock, ticktocks briefly", "speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a ticktock of a clock", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["a man makes an exclamation, then another man speaks", "water flows as a woman laughs and a man speaks"], 
"sample_ids": ["xO-Q2BlIIPU", "vddP56-ogds"], "start_seconds": ["30", "30"], "properties": ["two men, exclamation, speak", "water, flow, laugh"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "water is running and gurgling and a man is speaking"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a dog barks and whimpers", "frogs croak and vocalize"], "sample_ids": ["sShpyu2l4YQ", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["barks, whimpers, dog", "croak, vocalize, frog"], "captions_pred_video": ["the puppies are playing with a toy", "a close up of a frog in the water"], "captions_pred_audio": ["a dog is barking and growling", "a frog is croaking"], "question": "which animal is more vocal", "label": 1}, {"captions": ["a man speaks while water drains", "a telephone rings followed by a woman talking"], "sample_ids": ["vSeGhaZt-aI", "tGcFnX0GHI"], "start_seconds": ["50", "0"], "properties": ["water, drain, man", "ring, talk, woman"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["u5RmF3c3Aw", "zl9Dqx-j7q4"], "start_seconds": ["60", "6"], "properties": ["engine, car, zoom", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a jet engine roars "], "question": "which entity is followed by a car zooming by", "label": 0}, {"captions": ["rain falls on a surface as 
men speak and thunder roars", "a clock ticktocks in wind"], "sample_ids": ["w0xsN8X18Y", "yVumC9TGknc"], "start_seconds": ["30", "30"], "properties": ["rain, thunder, surface", "ticktocks, clock, wind"], "captions_pred_video": [null, "game title screen of the game shadow of the colossus on sony playstation 2"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a series of beeps and chirps"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["some tunes played by whistling", "a child speaks in closed space"], "sample_ids": ["u6BnG6YZqJ4", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["tune, play, whistling", "child, space, speak"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vBslzh7saPw", "su6FAOcOA8c"], "start_seconds": ["90", "4"], "properties": ["power, scream, increase", "engine, idle, woman"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman is speaking and a subway train is moving "], "question": "which engine is quieter", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "vehicles pass by on a roadway"], "sample_ids": ["rwtmaKiCcQU", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["nozzle, depressed, spray can", "pass, vehicle, roadway"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage of a 
fire truck entering a garage"], "captions_pred_audio": ["spraying and people speaking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xl2PIWyXaM", "tDVADusiIoc"], "start_seconds": ["160", "60"], "properties": ["chirp, man, younger person", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds are chirping and people are talking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "someone whistles a tune"], "sample_ids": ["t69a8aRKhmc", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["a, b, c", "someone, tune, whistle"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a person whistling a song"], "question": "which entity has a tune", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "an engine runs loudly"], "sample_ids": ["wAAkbZToh8", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["burp, laugh, speak", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man burps and a woman speaks", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["birds fly and flutter around", "pigeons vocalize and birds chirp"], "sample_ids": ["wGKgwOP3h30", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["fly, flutter, around", "vocalize, bird, chirp"], "captions_pred_video": ["of the pigeons in the coop", "of the pigeon in the cage"], 
"captions_pred_audio": ["pigeons coo and flap their wings", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a toilet flushes and a female speaks"], "sample_ids": ["wz7N8YRy74I", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["rooster, crow, background, people", "female, flushes, toilet"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a man speaks as a car is passing by"], "sample_ids": ["spYNpeN7rPY", "sK4u5T8hW78"], "start_seconds": ["1", "30"], "properties": ["a clock, ticktock, man", "a, car, pass"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "small dogs yip and bark sharply"], "sample_ids": ["y2ZBGpgbhHM", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["birds, tweet, pant", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds chirping and a dog panting", "a 
dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "an engine runs loudly"], "sample_ids": ["zl9Dqx-j7q4", "vqZuVbG6-HI"], "start_seconds": ["6", "130"], "properties": ["engine, laugh, loud", "loud, engine, run"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a jet engine roars ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vddP56-ogds", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["water, flow, laugh", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "an airplane engine spools and people speak"], "sample_ids": ["uYT5gxnyMWM", "wTjoRj1se3U"], "start_seconds": ["50", "390"], "properties": ["female, spraying, scream", "airplane, engine, spool"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking and then spraying and screaming?", "label": 0}, {"captions": ["multiple people speak and children yell while water gurgles", "paper folding and crinkling"], "sample_ids": ["vb1fPSDI4c", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["multiple, people, 
yell", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a crowd of people are talking and laughing", "the wind blows and a mouse clicks "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["xC8kbrKJmco", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["background, goat, scream", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a goat is bleating ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "water flows as men speak and yell"], "sample_ids": ["wvKpEYswXO0", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["plastic, tap, speak", "water, flow, men"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and 
water running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a liquid", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "paper folding and crinkling"], "sample_ids": ["xjhAnI2q6hM", "zPpG3RD8lSs"], "start_seconds": ["6", "20"], "properties": ["engine revs, vehicle, people", "paper, fold, crinkle"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "the wind blows and a mouse clicks "], "question": "which entity is a static object", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "a infant makes noise and is excited"], "sample_ids": ["vms5XGTDVQc", "wIJK3-5y0kA"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "noise, excited, infant"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["paper is crumpled and crinkled", "a baby cries and a woman speaks"], "question": "which entity is making noise", 
"label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yDoT73BWsdA", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["engine, revs, vehicle", "rustling, ducks, quack"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "an engine starts and increases in power"], "sample_ids": ["spJCm8tD9Zo", "zjTG0gaGCUI"], "start_seconds": ["90", "80"], "properties": ["snores, wheezes, sleeps", "power, increase, engine"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a jet engine roars as wind blows "], "question": "which entity is a source of power", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "a person snores loudly multiple times at a close distance"], "sample_ids": ["w2JXXIAdUdg", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["emits, sleeping, person", "loud, multiple, distance"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "a person snoring loudly"], "question": "which person is emitting a snore", "label": 1}, {"captions": ["a clock ticktocks briefly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["u7C-AEBQM", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["ticktocks, clock, ticktocks briefly", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a ticktock 
of a clock", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and spraying?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a woman speaks happily and an animal chirps"], "sample_ids": ["wRBHTgrbiwg", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["bird, owl, speak", "a woman, chirps, animal"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking and a dog is barking "], "question": "which entity has a man speaking briefly?", "label": 0}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["tiDFTC-5vU", "xV7Mg1QucSc"], "start_seconds": ["30", "14"], "properties": ["male, duck, laugh", "alarm, ticktocks, laughs"], "captions_pred_video": [null, "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "an alarm clock ticks and a woman laughs"], "question": "which entity is about a clock ticktocking and a man laughing?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a motorcycle engine is revving while people are speaking"], "sample_ids": ["sxYkFKFIZD0", "y8dSeubCNI"], "start_seconds": ["20", "4"], "properties": ["screech, man, door", "engine revving, people speaking, motorcycle"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the 
background ", "an engine revving and people talking in the background"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["waves of water rumble", "a mechanical buzzing getting louder"], "sample_ids": ["vwqaIHKxLvM", "sEprKHm8Sj8"], "start_seconds": ["20", "90"], "properties": ["sound, wave, water", "noise, loud, buzzing"], "captions_pred_video": ["of a surfer riding a big wave in the ocean", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["waves crash and wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["an insect buzzes around continuously", "people speaking indiscriminately in the distance with a person snoring loudly nearby"], "sample_ids": ["v25l1jef3JY", "w2JXXIAdUdg"], "start_seconds": ["0", "10"], "properties": ["buzzes, continuously, insect", "snoring, distance, person"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a close up shot of a person's mouth with a toothbrush in it"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a person snoring and a dog whimpering"], "question": "which entity is louder", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vfYTJq7nU", "ukg5L09Wpvo"], "start_seconds": ["130", "150"], "properties": ["rustling, ducks, quack", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a train 
blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["ukxt9I7eMMg", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["food, pan, cook", "loud, multiple, distance"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "an insect buzzes around continuously"], "sample_ids": ["ugHJF0hfYkg", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "buzzes, continuously, insect"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a helicopter is flying overhead ", "a fly is buzzing around a microphone "], "question": "which entity is not a helicopter?", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "waves crash against a shoreline and people speak"], "sample_ids": ["spJCm8tD9Zo", "yFB25fqfU8I"], "start_seconds": ["90", "300"], "properties": ["snores, wheezes, sleeps", "wave, crash, shoreline"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["men speak and a nozzle sprays liquid", "a vehicle engine accelerates and wind blows"], "sample_ids": ["wRV8yMk886E", "wudZTNBtVqc"], "start_seconds": ["0", "60"], "properties": ["liquid, spray, nozzle", "accelerates, engine, wind"], 
"captions_pred_video": ["two cars are parked in a parking lot at night", "footage is of a parking lot with cars parked in it"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak then an engine runs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uMTTDZ2mb4", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["engine, run, people", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wvKpEYswXO0", "wz7N8YRy74I"], "start_seconds": ["150", "30"], "properties": ["water, tap, run", "rooster, crow, background, men"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a rooster?", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a child speaks in closed space"], "sample_ids": ["zPX9o1uDiI", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["engine, horn, run", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a woman is speaking with 
background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a stream of water runs briefly"], "sample_ids": ["zcDwZ6W7E3E", "x-PeY8Yb8M4"], "start_seconds": ["180", "300"], "properties": ["man, speak, motorcycles", "stream, water, run"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["someone snores nearby", "a stream of water runs briefly"], "sample_ids": ["spJCm8tD9Zo", "x-PeY8Yb8M4"], "start_seconds": ["90", "300"], "properties": ["someone snores, nearby, someone", "stream, water, run"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person is snoring loudly", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "winds blows roughly as a vehicle races past"], "sample_ids": ["vzxHnu-SFEw", "xjvTpk2Zpr8"], "start_seconds": ["80", "70"], "properties": ["two objects, woman, speak", "wind, blows, vehicle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["male speech with light ticking", "a telephone rings followed by a woman talking"], "sample_ids": ["xO-Q2BlIIPU", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["male, speech, ticking", "ring, talk, woman"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a cat meows and children speak", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["x5cuQjOdM3E", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["cat, speak, children", "a woman, laughs, animal"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "water pouring and bubbling"], "sample_ids": ["v5P-ThUCINM", "uyRfq-jKPpo"], "start_seconds": ["400", "50"], "properties": ["background, chirp, bird", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an 
afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and birds are chirping", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tDVADusiIoc", "tDVADusiIoc"], "start_seconds": ["60", "60"], "properties": ["man, radio, blows", "water, radio, man"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be in a storm", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a machine beeps continuously"], "sample_ids": ["vveS8HT7Uog", "y682ml90jGw"], "start_seconds": ["100", "11"], "properties": ["a man, objects, speak", "beeps, machine, continuously"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a beeping sound 
is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "an infant crying as a woman laughs"], "sample_ids": ["uPDn2BFTHk", "xhmRY9yhC7c"], "start_seconds": ["140", "20"], "properties": ["woman, laughs, speaks", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a baby cries and a woman speaks"], "question": "which woman is laughing", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["vb1fPSDI4c", "vXlk0lIQBFo"], "start_seconds": ["30", "470"], "properties": ["multiple, people, yell", "wind, speak, vocalize"], "captions_pred_video": [null, "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a crowd of people are talking and laughing", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "birds chirp and objects are moved around"], "sample_ids": ["x6ijhqRY38s", "yPUYU6t3rwo"], "start_seconds": ["250", "370"], "properties": ["something metal, glass, hit", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and wind blows", "several insects fly while two men talk"], "sample_ids": ["sxIvBMSavMQ", "s-T9OVOiMLo"], "start_seconds": ["210", "330"], "properties": ["birds, chirp, wind", "several, fly, men"], "captions_pred_video": ["beekeeping 101 how to extract honey from 
a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a man speaks followed by another man speaking outside"], "sample_ids": ["zTLVJCo4WEE", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "two men, speak, follow"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a drill runs and two people laugh", "an engine runs loudly"], "sample_ids": ["tEE3MpBt1sg", "vqZuVbG6-HI"], "start_seconds": ["50", "130"], "properties": ["two people, laugh, drill", "loud, engine, run"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage is blurry because it's raining outside"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a lawn mower is running 
and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sTpirNYo8vQ", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["a, tone, fast", "engine, idle, woman"], "captions_pred_video": ["of a man taking a selfie on a bus", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and a subway train is moving "], "question": "which woman is speaking", "label": 1}, {"captions": ["a weapon fires multiple times", "several insects fly while two men talk"], "sample_ids": ["sMC07Ucy7kg", "s-T9OVOiMLo"], "start_seconds": ["10", "330"], "properties": ["weapon, fire, multiple", "several, fly, men"], "captions_pred_video": ["footage is from a car's point of view", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xZepNM9qcRA", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["background, motor, run", "airplane, boy, fly"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a motor running in the background?", "label": 0}, {"captions": ["a rumble grows louder", "water pouring and bubbling"], "sample_ids": 
["y4MY9mp8-TA", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["loudness, increase, rumble", "water, bubbles, pouring"], "captions_pred_video": ["a helicopter flying in the sky", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a helicopter flies overhead ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xyL9F5VrjkE", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["wind, motor, distance", "airplane, boy, fly"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w34HjHr6gAY", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["beeps, squawk, child speaking", "three men, wind, 
flow"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "some men converse over an engine running"], "sample_ids": ["tOj4tdLRaA", "sCiy7QS1U"], "start_seconds": ["70", "300"], "properties": ["woman, laugh, baby", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman speaking and a baby laughing?", "label": 0}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wnpJndXuxLc", "vfYTJq7nU"], "start_seconds": ["50", "130"], "properties": ["beeps, loud, whistle", "rustling, ducks, quack"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a duck quacks and a woman speaks"], "question": "which entity is more natural", "label": 1}, {"captions": ["frogs croak and vocalize", "someone whistles a tune"], "sample_ids": 
["yswmmRZFItk", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["croak, vocalize, frog", "someone, tune, whistle"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a frog croaks as other frogs croak in the background"], "sample_ids": ["wvKpEYswXO0", "yswmmRZFItk"], "start_seconds": ["150", "0"], "properties": ["sound, water, running", "background, frog, croak"], "captions_pred_video": ["of the person preparing food in the kitchen", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a frog is croaking"], "question": "which entity has a background of frogs?", "label": 1}, {"captions": ["a child babbles as a woman speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wEBlkGWVWwE", "sLUnaPT5gM8"], "start_seconds": ["260", "0"], "properties": ["a, babble, woman", "loud, laughter, intermittent"], "captions_pred_video": ["shows a person writing on the whiteboard", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a clock ticktocks"], "sample_ids": ["sEprKHm8Sj8", "v-g-j2uTByM"], "start_seconds": ["90", "30"], "properties": ["car, tires, slows", "ticktocks, clock, ticktocks"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 
2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a man speaks as a motor runs in the background"], "sample_ids": ["y1saVTXsKwc", "xZepNM9qcRA"], "start_seconds": ["80", "30"], "properties": ["a, dog, talk", "background, motor, run"], "captions_pred_video": ["a dog playing with a pink ball", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a dog barks and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["tEE3MpBt1sg", "yDoT73BWsdA"], "start_seconds": ["50", "10"], "properties": ["drill, something, laugh", "engine, revs, vehicle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which is a vehicle", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "pigeons vocalize and birds chirp"], "sample_ids": ["s7knHCFW82w", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["blow horn, get close, train", "vocalize, bird, chirp"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "of the pigeon in the cage"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking and a bee is buzzing"], "question": "which entity is 
not a train?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xO-Q2BlIIPU", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["two men, exclamation, speak", "three men, wind, flow"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a small engine idles continuously", "a horn rings out as a machine runs by"], "sample_ids": ["y5WII6cTH7k", "slZLHwNbbt4"], "start_seconds": ["40", "300"], "properties": ["engine, idle, continuously", "a, horn, run"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is moving", "label": 1}, {"captions": ["a car accelerates and wind blows", "someone is typing on a computer keyboard"], "sample_ids": ["u0TrcHhkPQ", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["accelerates, wind, blows", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which object is moving", "label": 0}, {"captions": ["a dog barks and whimpers", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sShpyu2l4YQ", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["barks, whimpers, dog", "engine, idle, woman"], "captions_pred_video": ["the 
puppies are playing with a toy", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a subway train is moving "], "question": "which entity is a human", "label": 1}, {"captions": ["people speak as gunfire rings out", "an engine runs loudly"], "sample_ids": ["wqTCwqVRDlk", "vqZuVbG6-HI"], "start_seconds": ["80", "130"], "properties": ["gunfire, ring, speak", "loud, engine, run"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "music plays followed by gunshots and then an explosion"], "sample_ids": ["uC9dtII1KDI", "xKB8O8LTs6s"], "start_seconds": ["150", "70"], "properties": ["wind, gusts, distance", "music, gunshots, explosion"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["male speech with light ticking", "a vehicle is skidding and squealing tires"], "sample_ids": ["xO-Q2BlIIPU", "soTOh3zYJfY"], "start_seconds": ["30", "40"], "properties": ["male, speech, ticking", "vehicle, skid, tires"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", 
"label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "some men converse over an engine running"], "sample_ids": ["xfudFO976zE", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["animal, bleats, cry", "men, converse, engine"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "paper folding and crinkling"], "sample_ids": ["sEprKHm8Sj8", "zPpG3RD8lSs"], "start_seconds": ["90", "20"], "properties": ["noise, loud, buzzing", "paper, fold, crinkle"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which entity is not a 
noise", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["y2ZBGpgbhHM", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["birds, tweet, pant", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sa6TLVbooCc", "uYT5gxnyMWM"], "start_seconds": ["240", "50"], "properties": ["people, laugh, child", "a, scream, girl"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a woman is speaking and a baby is crying"], "question": "which entity has a child speaking?", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["yZp6xizR0yU", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["animal, bleat, cry", "people, applaud, hoot"], "captions_pred_video": ["footage of a woman feeding goats in a barn", null], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds fly and flutter around", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wGKgwOP3h30", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["fly, flutter, around", "applause, audience, yells"], 
"captions_pred_video": ["of the pigeons in the coop", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["pigeons coo and flap their wings", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a drill runs and two people laugh", "dishes cling together then a man begins to speak"], "sample_ids": ["tEE3MpBt1sg", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["two people, laugh, drill", "cling, speak, dishes"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "mechanisms are operating and water is splashing "], "question": "which entity is about a drill?", "label": 0}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wSVhSdj0F0", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["beep, clang, footsteps", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a machine beeps continuously"], "sample_ids": ["vZAqdHZ81yA", "y682ml90jGw"], "start_seconds": ["180", "11"], "properties": ["engine, motorcycle, idling", "beeps, machine, continuously"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a beeping sound is being made "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "a 
person snores loudly multiple times at a close distance"], "sample_ids": ["vMf1dLD6Sng", "sSMl2vc3ek"], "start_seconds": ["6", "20"], "properties": ["frog, bird, vocalize", "loud, multiple, distance"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", null], "captions_pred_audio": ["a frog croaks loudly", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "several insects fly while two men talk"], "sample_ids": ["vddP56-ogds", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["liquid, laughs, man", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["a woman and man speak while food is frying", "people speak as gunfire rings out"], "sample_ids": ["zk-xJGQU8-4", "wqTCwqVRDlk"], "start_seconds": ["130", "80"], "properties": ["food, man, woman", "gunfire, ring, speak"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uC9dtII1KDI", "vb1fPSDI4c"], "start_seconds": ["150", "30"], "properties": ["wind, gusts, distance", "multiple, people, yell"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", null], "captions_pred_audio": ["a woman is speaking with wind noise and 
breathing in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an engine starts and increases in power", "two men speak as a buffeting wind blows"], "sample_ids": ["zjTG0gaGCUI", "y8WEcpOlT3I"], "start_seconds": ["80", "40"], "properties": ["power, increase, engine", "wind, speak, buffeting"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking with wind noise in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a toilet flushes and water drains unevenly"], "sample_ids": ["yks4cLgIDMc", "vhJWZheqaE"], "start_seconds": ["170", "0"], "properties": ["background, speaking, child", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", "a toilet is flushed"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "an airplane flies overhead as a woman speaks"], "sample_ids": ["wz7N8YRy74I", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["rooster, crow, background, men", "airplane, fly, overhead"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying overhead", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "several ducks quack and cocks crow far away"], "sample_ids": 
["y2ZBGpgbhHM", "sNB8zxXneIM"], "start_seconds": ["30", "20"], "properties": ["birds, tweet, pant", "several, quack, cocks"], "captions_pred_video": [null, "a group of geese in a cage"], "captions_pred_audio": ["birds chirping and a dog panting", "a rooster is crowing and wind is blowing "], "question": "which entity is about birds?", "label": 0}, {"captions": ["an animal quacks rapidly", "water pouring and bubbling"], "sample_ids": ["vh30P49Po6s", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["animal, quacks, rapidly", "water, bubbles, pouring"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a duck is quacking loudly", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a dark barks and whimpers", "small dogs yip and bark sharply"], "sample_ids": ["sYj4hpDUZDQ", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["barks, whimpers, dark", "bark, yip, sharply"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a dog barks and a cat meows", "a dog barks and 
growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["w5W5Kqtc8E", "su6FAOcOA8c"], "start_seconds": ["100", "4"], "properties": ["water, splashes, motorboat", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w5W5Kqtc8E", "wz7N8YRy74I"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman sneezes then speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["x4dZyf9Gbj0", "sLUnaPT5gM8"], "start_seconds": ["130", "0"], "properties": ["sneezes, speaks, woman", "loud, laughter, intermittent"], "captions_pred_video": ["footage is blurry and out of focus", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman sneezes and speaks", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a sneeze", "label": 0}, {"captions": ["a woman speaks with water running", "people applaud and hoot and chat quietly"], "sample_ids": ["wTideSjRFS0", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["water, running, woman", 
"people, applaud, hoot"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["ticking continues without interruption", "water pouring and bubbling"], "sample_ids": ["v-g-j2uTByM", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["ticking, continuous, clock", "water, bubbles, pouring"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a clock is ticking loudly", "water is running from a faucet"], "question": "which entity is not continuous", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "a man speaks over intermittent keyboard taps"], "sample_ids": ["x9JovgqUcs", "tw76HGONaKg"], "start_seconds": ["500", "570"], "properties": ["a, man, speaks, keyboard", "audio, man, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend 
of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man speaks and types on a computer keyboard "], "question": "which entity is a video?", "label": 0}, {"captions": ["a man speaking with light rustling", "birds chirp and objects are moved around"], "sample_ids": ["zOZleIRqZm4", "yPUYU6t3rwo"], "start_seconds": ["80", "370"], "properties": ["light, rustling, man", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "vehicles pass by on a roadway"], "sample_ids": ["shmR4OZtzqA", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["man, engine, idle", "pass, vehicle, roadway"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube 
how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man speaks while a motor runs", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a person speaks briefly", "a car accelerates and wind blows"], "sample_ids": ["zOZleIRqZm4", "u0TrcHhkPQ"], "start_seconds": ["80", "20"], "properties": ["person, talk, brief", "accelerates, wind, blows"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a race car accelerates and revs its engine "], "question": "which is not a person", "label": 1}, {"captions": ["a person is whistling a tune", "wind blows as people chatter quietly"], "sample_ids": ["scYRUkrFLiQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, tune, whistle", "wind, chatter, people"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["zY3icUyMdh8", "uqFtmnhuqA8"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "a, b, c"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "shows a clock on the wall of a room with a person 
standing in front of it"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a door opens and closes", "a person sneezes followed by another person speaking"], "sample_ids": ["vBHyYJ8pL0", "t8CV69hcvF0"], "start_seconds": ["2", "210"], "properties": ["open, close, door", "person, sneeze, follow"], "captions_pred_video": [null, "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a woman sneezes and speaks"], "question": "which entity is a follow up to something else?", "label": 1}, {"captions": ["ticking continues without interruption", "a clock ticktocks"], "sample_ids": ["v-g-j2uTByM", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "ticktocks, clock, ticktocks"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking loudly", "a clock is ticking loudly"], "question": "which clock ticks continuously", "label": 0}, {"captions": ["an engine idles quietly then gradually becomes louder", "loud clanking and banging with brief male speech"], "sample_ids": ["vbr9mHKc8WM", "sWZzXuWYY"], "start_seconds": ["40", "420"], "properties": ["noise, loudness, engine", "male, speech, banging"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine is idling", "a sewing machine runs and a man speaks"], "question": "which entity is louder", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zY3icUyMdh8", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["dog, bark, engine", "female, spraying, scream"], 
"captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a car accelerates and wind blows", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["u0TrcHhkPQ", "x6ijhqRY38s"], "start_seconds": ["20", "250"], "properties": ["accelerates, wind, blows", "something metal, glass, hit"], "captions_pred_video": [null, "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and dishes are clanging "], "question": "which entity is a demonstration of a force", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "water splashes and a door squeaks"], "sample_ids": ["weDbePuc-Xc", "sdXV-ylviw"], "start_seconds": ["40", "190"], "properties": ["cartoon character, music, vocalize", "sound, splash, door"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a dog barks and taps with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uYT5gxnyMWM", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["female, spraying, scream", "music, gunfire, explosion"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and a baby 
is crying", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a man speaks as a car is passing by"], "sample_ids": ["xjvTpk2Zpr8", "sK4u5T8hW78"], "start_seconds": ["70", "30"], "properties": ["engine, run, wind", "a, car, pass"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is stationary", "label": 1}, {"captions": ["someone snores nearby", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["spJCm8tD9Zo", "uEU-Hg5MTN8"], "start_seconds": ["90", "27"], "properties": ["someone snores, nearby, someone", "a woman, laughs, animal"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["t97k0cejSQE", "uZesmtKZGSw"], "start_seconds": ["250", "250"], "properties": ["bird, chirp, insect", "men, talk, cars"], "captions_pred_video": ["a bee on a purple thistle flower", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a drill runs and two people laugh", "a telephone rings followed by a woman talking"], "sample_ids": ["tEE3MpBt1sg", "tGcFnX0GHI"], "start_seconds": ["50", "0"], "properties": ["two people, laugh, drill", "ring, talk, woman"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sSMl2vc3ek", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["loud, multiple, distance", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xjhAnI2q6hM", "wDVMhEdTiVw"], "start_seconds": ["6", "30"], "properties": ["wind, blow, loudly", "gun, shoot, water"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", 
"a blurry image of trees and water in the forest"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause damage", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zALy31PjDl0", "uZesmtKZGSw"], "start_seconds": ["21", "250"], "properties": ["a man, a vehicle, a horn", "men, talk, cars"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["several ducks are quacking and squawking", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["wfHeoPDLMaM", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["quacking, squawking, ducks", "background, birds, rustling"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in 
the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "birds are chirping and a chime is ringing "], "question": "which entity is a bird", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "rain falls on a surface as men speak and thunder roars"], "sample_ids": ["w-4gHptFNuU", "w0xsN8X18Y"], "start_seconds": ["21", "30"], "properties": ["engine revs, accelerates, bump", "rain, thunder, surface"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is a natural occurrence", "label": 1}, {"captions": ["a man talks as several small engines run", "a woman speaks as she rubs two objects together"], "sample_ids": ["u9A6VZQCZpU", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["a, man, talk", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a heavy rain falls endlessly", "an airplane engine roars increasingly louder"], "sample_ids": ["wP8ZKrlx3oA", "vBslzh7saPw"], "start_seconds": ["40", "90"], "properties": ["heavy, rain, fall", "engine, roar, louder"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a jet engine roars and accelerates "], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks in wind", "plastic is tapped on while someone speaks"], "sample_ids": ["yVumC9TGknc", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["ticktocks, clock, wind", "plastic, tap, speak"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a series of beeps and chirps", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man speaks as a motor runs in the background"], "sample_ids": ["vD6lYD1l0BY", "xZepNM9qcRA"], "start_seconds": ["330", "30"], 
"properties": ["a, machine, run", "background, motor, run"], "captions_pred_video": ["game controller being held in the hands of the person", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man speaks while a motorcycle revs and accelerates "], "question": "which machine runs in the background", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a woman speaks and then a man speaks"], "sample_ids": ["xyL9F5VrjkE", "vbpKkWvfOu4"], "start_seconds": ["20", "560"], "properties": ["engine, run, wind", "a, man, speaks"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and a man is speaking"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "someone whistles a tune"], "sample_ids": ["vveS8HT7Uog", "sIXTftIuUgw"], "start_seconds": ["100", "90"], "properties": ["a man, objects, speak", "someone, tune, whistle"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["speaking following by laughing and clapping", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["u2f5NpsoHBg", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["person, laugh, clap", "men, talk, cars"], "captions_pred_video": ["is being 
projected on a screen at the front of the stage", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity shows a person speaking?", "label": 0}, {"captions": ["food fries in a pan as someone talks and cooks", "a infant makes noise and is excited"], "sample_ids": ["ukxt9I7eMMg", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "noise, excited, infant"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["siJFXfGWgDk", "vlJS7LN2XyM"], "start_seconds": ["50", "30"], "properties": ["a, bird, vehicle", "background, clocks, ticking"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a ticktock of a clock"], 
"question": "which entity has a vehicle in the background?", "label": 0}, {"captions": ["engines sputter roughly and tires squeal", "a telephone rings followed by a woman talking"], "sample_ids": ["zhx6hoYrHeI", "tGcFnX0GHI"], "start_seconds": ["160", "0"], "properties": ["engine, sputter, rough", "ring, talk, woman"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "an engine runs loudly"], "sample_ids": ["wSVhSdj0F0", "vqZuVbG6-HI"], "start_seconds": ["10", "130"], "properties": ["horn honks, keys jingle, slam", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle engine is idling", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vZAqdHZ81yA", "xfaoyyzw2WU"], "start_seconds": ["180", "180"], "properties": ["engine, motorcycle, idling", "loud, jet engine, roar"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an engine is idling loudly", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a woman speaks happily and an animal chirps"], "sample_ids": ["xNMovAf3o50", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["rain, thunder, music", "a woman, chirps, animal"], "captions_pred_video": ["tieng mua - the falling 
rain lynk lee", null], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an engine runs and a man speaks", "wind blows as people chatter quietly"], "sample_ids": ["yT5WfYMRr-U", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "wind, chatter, people"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["continuous snoring", "people applaud and hoot and chat quietly"], "sample_ids": ["sLkeqCDJIyw", "wwyfGO2J4"], "start_seconds": ["120", "90"], "properties": ["loud, snoring, noise", "people, applaud, hoot"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "paper is crumpling consistently"], "sample_ids": ["vb1fPSDI4c", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["multiple, people, yell", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a crowd of people are talking and laughing", "paper is crumpled and crinkled"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "paper folding and crinkling"], "sample_ids": ["slZLHwNbbt4", "zPpG3RD8lSs"], "start_seconds": ["300", "20"], "properties": ["a, horn, run", "paper, fold, crinkle"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "the wind blows and a mouse clicks "], "question": 
"which is not a machine", "label": 1}, {"captions": ["an engine starts and increases in power", "a train engine runs and a horn blows"], "sample_ids": ["zjTG0gaGCUI", "zPX9o1uDiI"], "start_seconds": ["80", "40"], "properties": ["power, increase, engine", "engine, horn, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["a jet engine roars as wind blows ", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is a train engine?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["wRBHTgrbiwg", "w34HjHr6gAY"], "start_seconds": ["50", "30"], "properties": ["bird, owl, speak", "beeps, hit, woman"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a beep sounds followed by a child speaking"], "question": "which entity has a man speaking briefly?", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vzxHnu-SFEw", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["two objects, woman, speak", "three men, wind, flow"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a man speaks while water trickles and flows", "some tunes played by whistling"], "sample_ids": ["sapQIQUhFc", "u6BnG6YZqJ4"], "start_seconds": ["280", "0"], "properties": ["water, trickles, flow", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zj2R0XoFr5k", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["airplane, boy, fly", "a, scream, girl"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a person spraying paint on the ceiling"], 
"captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a baby is crying"], "question": "which entity has a boy speaking?", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["tDlfY3nmx1A", "uEU-Hg5MTN8"], "start_seconds": ["160", "27"], "properties": ["applause, laugh, man", "animal, grunts, snorts"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["continuous chugging with birds chirping in the background", "a toilet flushes and a female speaks"], "sample_ids": ["xM4joTqDVp4", "yaln9y8I7ms"], "start_seconds": ["160", "230"], "properties": ["background, chirp, birds", "female, flushes, toilet"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["water rushes by", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["x-PeY8Yb8M4", "zj2R0XoFr5k"], "start_seconds": ["300", "50"], "properties": ["water, rushes, by", "airplane, boy, fly"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car is driving on a wet road ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", 
"label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xKB8O8LTs6s", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["music, gunfire, explosion", "three men, wind, flow"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sAam2NqGhLY", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "stream, water, flow"], "captions_pred_video": ["of a little girl sleeping on a couch", "footage is blurry and out of focus"], "captions_pred_audio": ["a person is snoring", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sapQIQUhFc", "wqZ135Ssz0"], "start_seconds": ["280", "60"], "properties": ["water, trickles, flow", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tDlfY3nmx1A", "tdWhHV3X25Q"], "start_seconds": ["160", "60"], "properties": ["applause, laugh, man", "applause, audience, 
yells"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man is speaking and a crowd is clapping"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["an airplane engine runs", "pigeons vocalize and birds chirp"], "sample_ids": ["yVPZ2MNWpms", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["engine, airplane, runs", "vocalize, bird, chirp"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of the pigeon in the cage"], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "a clock ticktocks"], "sample_ids": ["wAAkbZToh8", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man burps and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "dishes cling together then a man begins to speak"], "sample_ids": ["xZepNM9qcRA", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["background, motor, run", "cling, speak, dishes"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["heavy rain splashes as it falls", "water is sprayed across a hard surface"], 
"sample_ids": ["wP8ZKrlx3oA", "sQwlkXjQabo"], "start_seconds": ["40", "10"], "properties": ["fall, rain, splash", "water, spray, surface"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a heavy rain is falling on a surface", "spraying followed by silence"], "question": "which entity is a spray of water?", "label": 1}, {"captions": ["a person snoring several times", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["spJCm8tD9Zo", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["snore, person, several", "two men, woman, birds"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["a man speaks as water trickles down a stream", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sapQIQUhFc", "wz7N8YRy74I"], "start_seconds": ["280", "30"], "properties": ["water, stream, trickles", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["some men converse over an engine running", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sCiy7QS1U", "ukg5L09Wpvo"], "start_seconds": ["300", "150"], "properties": ["men, converse, engine", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt 
road"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a person screams glaringly"], "sample_ids": ["w2M4i1mklOA", "xC8kbrKJmco"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "glaringly, screams, person"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a goat is bleating "], "question": "which entity is louder", "label": 1}, {"captions": ["a kid speaks followed by music playing", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tQWGZLItBXk", "uZesmtKZGSw"], "start_seconds": ["170", "250"], "properties": ["music, kid, speak", "men, talk, cars"], "captions_pred_video": ["worms revolution screenshots", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["vzxHnu-SFEw", "siJFXfGWgDk"], "start_seconds": ["80", "50"], "properties": ["two objects, woman, speak", "man, woman, 
vehicle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "water flows as men speak and yell"], "sample_ids": ["sK4u5T8hW78", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, car, pass", "water, flow, men"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing 
sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking as a car is passing by?", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uEU-Hg5MTN8", "xKB8O8LTs6s"], "start_seconds": ["27", "70"], "properties": ["a woman, laughs, animal", "music, gunfire, explosion"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["ylpYOorfH4o", "uEU-Hg5MTN8"], "start_seconds": ["410", "27"], "properties": ["motor, run, steady", "a woman, laughs, animal"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["loud clanking and banging with brief male 
speech", "a woman speaks followed by another woman whimpering and speaking"], "sample_ids": ["sWZzXuWYY", "xOZfdgAgJ9o"], "start_seconds": ["420", "40"], "properties": ["male, speech, banging", "woman, whimpering, speaking"], "captions_pred_video": [null, "footage of a woman talking to a man in a doctor's office"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a woman speaking?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["sQGXqGcwOTc", "s7knHCFW82w"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "blow horn, get close, train"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["a dog barks and whimpers", "a woman speaks as she rubs two objects together"], "sample_ids": ["sShpyu2l4YQ", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["barks, whimpers, dog", "two objects, woman, speak"], "captions_pred_video": ["the puppies are playing with a toy", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "water splashes as an animal walks through"], "sample_ids": ["yks4cLgIDMc", "w1ir-sZ3Im8"], "start_seconds": ["170", "90"], "properties": ["background, speaking, child", "animal, water, splashes"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and a child is crying", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vzceMbklWc", "ukg5L09Wpvo"], "start_seconds": ["180", "150"], "properties": ["water, faucet, sink", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["water is running and a man is speaking", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "water is sprayed across a hard surface"], "sample_ids": ["uiItxDsDMFI", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["wood, piece, saw", "water, spray, surface"], "captions_pred_video": ["a man cutting a log with an axe in 
the woods", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a saw is being used with background noise ", "spraying followed by silence"], "question": "which entity is wetter", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "frogs croak and vocalize"], "sample_ids": ["xfudFO976zE", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["animal, bleats, cry", "croak, vocalize, frog"], "captions_pred_video": ["footage is blurry and shaky", "a close up of a frog in the water"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a frog is croaking"], "question": "which animal is more vocal", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "water splashes and a door squeaks"], "sample_ids": ["wz7N8YRy74I", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["rooster, crow, background, men", "sound, splash, door"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a dog barks and taps with background noise "], "question": "which entity has a door?", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a stream of water runs briefly"], "sample_ids": ["yVumC9TGknc", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["humming, clock, birds", "stream, water, run"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a series of beeps and chirps", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["birds chirp and wind blows", "a man speaks as a car is passing by"], "sample_ids": ["sxIvBMSavMQ", "sK4u5T8hW78"], "start_seconds": ["210", "30"], "properties": ["birds, 
chirp, wind", "a, car, pass"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "an airplane engine spools and people speak"], "sample_ids": ["smDKStoHBJo", "wTjoRj1se3U"], "start_seconds": ["0", "390"], "properties": ["a, talk, baby, cry", "airplane, engine, spool"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a jet engine is running and people are talking"], "question": "which entity is a video of a person talking?", "label": 0}, {"captions": ["a man speaks while rain 
falls onto a hard surface", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["wqN6IIHw3po", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "gun, shoot, water"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and water is splashing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a gun?", "label": 0}, {"captions": ["a baby cries and a woman speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["tMbMDvT50j8", "w34HjHr6gAY"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "beeps, hit, woman"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a baby cries and a woman speaks", "a beep sounds followed by a child speaking"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a race car approaches quickly and slows down squealing tires", "wind blowing followed by a zoom"], "sample_ids": ["sEprKHm8Sj8", "vr8ZXjEBhMQ"], "start_seconds": ["90", "150"], "properties": ["car, tires, slows", "wind, blow, zoom"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 
2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "a door opens and closes"], "sample_ids": ["tOj4tdLRaA", "vBHyYJ8pL0"], "start_seconds": ["70", "2"], "properties": ["woman, laugh, baby", "open, close, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is more passive", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yZmhM1HcsyE", "vJ7JPEFhyLA"], "start_seconds": ["4", "16"], "properties": ["engine, roar, water", "three men, wind, flow"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a person screams glaringly", "a toilet flushes and water drains"], "sample_ids": ["xC8kbrKJmco", "sfAvvZwdLCY"], "start_seconds": ["0", "20"], "properties": ["glaringly, screams, person", "water drains, flushes, water"], "captions_pred_video": [null, "footage of the toilet in the bathroom"], "captions_pred_audio": ["a goat is bleating ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a cat meows and children speak"], "sample_ids": 
["wqN6IIHw3po", "x5cuQjOdM3E"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "cat, speak, children"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a man is speaking and water is splashing", "a cat meows and a woman speaks"], "question": "which entity is a cat?", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uWAAAL4CIoc", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["a, dog, vocalize", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which animal is speaking", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wy1eKjR7KC0", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["people, talk, distance", "loud, jet engine, roar"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and a siren is going off", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a toilet flushes and water drains", "water is sprayed across a hard surface"], "sample_ids": ["sfAvvZwdLCY", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["water drains, flushes, water", "water, spray, surface"], "captions_pred_video": ["footage of the toilet in the bathroom", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a 
toilet is flushed", "spraying followed by silence"], "question": "which entity is a source of water", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a infant makes noise and is excited"], "sample_ids": ["wnpJndXuxLc", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["blows, vehicle, train", "noise, excited, infant"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "plastic is tapped on while someone speaks"], "sample_ids": ["vzceMbklWc", "wvKpEYswXO0"], "start_seconds": ["180", "150"], "properties": ["water, faucet, sink", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["water is running and a man is speaking", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["xBxDz0CFVn0", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["stream, water, flow", "loud, laughter, intermittent"], "captions_pred_video": ["footage is blurry and out of focus", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream of water flows as people talk and wind blows", "people cheer as a vehicle engine revs"], 
"sample_ids": ["xBxDz0CFVn0", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["stream, water, flow", "engine revs, vehicle, people"], "captions_pred_video": ["footage is blurry and out of focus", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a man speaks as a motor runs in the background"], "sample_ids": ["sapQIQUhFc", "xZepNM9qcRA"], "start_seconds": ["280", "30"], "properties": ["water, trickles, flow", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "dishes cling together then a man begins to speak"], "sample_ids": ["tOSWIURC-4", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["engine, work, nearby", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a lawn mower is running ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["multiple ducks quack continuously", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wfHeoPDLMaM", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["multiple, quack, continuously", "loud, laughter, intermittent"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the 
barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["ducks are quacking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["w0xsN8X18Y", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["music, surface, rain", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a man speaks as a motor runs in the background"], "sample_ids": ["vZAw4apG0Es", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["background, clock, ticktocks", "background, motor, run"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man speaks while a motorcycle revs and accelerates "], "question": "which 
entity has a clock in the background", "label": 0}, {"captions": ["a man sprays as a scraping occurs in the background", "a telephone rings followed by a woman talking"], "sample_ids": ["sOa7g-44Dag", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["background, man, spray", "ring, talk, woman"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a woman speaks happily and an animal chirps"], "sample_ids": ["zsLxS-uLJTw", "uWAAAL4CIoc"], "start_seconds": ["20", "0"], "properties": ["horn, blast, train", "a woman, chirps, animal"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", null], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a child speaks in closed space"], "sample_ids": ["viuTg1M-dqg", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["two men, speak, follow", "child, space, speak"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["zY3icUyMdh8", "s7knHCFW82w"], "start_seconds": ["20", 
"30"], "properties": ["dog, bark, engine", "blow horn, get close, train"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a cat meows and children speak", "several insects fly while two men talk"], "sample_ids": ["x5cuQjOdM3E", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["cat, speak, children", "several, fly, men"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["male speech with light ticking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xO-Q2BlIIPU", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["male, speech, ticking", "three men, wind, flow"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a frog croaks as other frogs croak in the background"], "sample_ids": ["u2f5NpsoHBg", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["person, laugh, clap", "background, frog, croak"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "a close up 
of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a frog is croaking"], "question": "which entity is a frog", "label": 1}, {"captions": ["a male speaks over some small clicks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uXxVebHsGZ8", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["male, clicks, speak", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "birds chirp and objects are moved around"], "sample_ids": ["y8dSeubCNI", "yPUYU6t3rwo"], "start_seconds": ["4", "370"], "properties": ["men, women, car", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["an engine revving and people talking in the background", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["leaves rustle while man speaks", "a car accelerates and wind blows"], "sample_ids": ["zOZleIRqZm4", "u0TrcHhkPQ"], "start_seconds": ["80", "20"], "properties": ["leaves, rustle, speak", "accelerates, wind, blows"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "motors rev and run loudly as a person laughs"], "sample_ids": ["x9JovgqUcs", "zl9Dqx-j7q4"], "start_seconds": ["500", "6"], "properties": ["a, man, 
speaks, keyboard", "motors rev, laugh, loudly"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a jet engine roars "], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a train engine runs and a horn blows"], "sample_ids": ["sapQIQUhFc", "zPX9o1uDiI"], "start_seconds": ["280", "40"], "properties": ["liquid, flow, distance", "engine, horn, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sQGXqGcwOTc", "zl9Dqx-j7q4"], "start_seconds": ["3", "6"], "properties": ["cling, speak, dishes", "engine, laugh, loud"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a man driving a car in the dark"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "water is sprayed across a hard surface"], "sample_ids": ["tEE3MpBt1sg", "sQwlkXjQabo"], "start_seconds": ["50", "10"], "properties": ["drill, something, laugh", "water, spray, surface"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a man speaks and a rooster crows while men talk in the 
background"], "sample_ids": ["yeFvk9x0wWI", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["chirp, twitter, clatter", "rooster, crow, background, men"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a rooster?", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "vehicles pass by on a roadway"], "sample_ids": ["vXlk0lIQBFo", "tgbONvsP47Y"], "start_seconds": ["470", "0"], "properties": ["wind, talk, vocalize", "pass, vehicle, roadway"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a fire truck entering a garage"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "people cheer as a vehicle engine revs"], "sample_ids": ["uZesmtKZGSw", "xjhAnI2q6hM"], "start_seconds": ["250", "6"], "properties": ["men, talk, cars", "engine revs, vehicle, people"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is 
speaking and a car is revving with laughter in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["zuua6-5goWw", "y2ZBGpgbhHM"], "start_seconds": ["30", "30"], "properties": ["sound, pop, bird", "birds, tweet, pant"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "birds chirping and a dog panting"], "question": "which entity has more birds", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w8uLijTqtlU", "xBxDz0CFVn0"], "start_seconds": ["70", "30"], "properties": ["wind, microphone, noise", "stream, water, flow"], "captions_pred_video": ["footage is blurry and shaky", "footage is blurry and out of focus"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["s6DESzUTGjY", "yajyRTUQk3U"], "start_seconds": ["16", 
"400"], "properties": ["wind, laugh, woman", "a woman, something, fried"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which woman is frying something?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yRx9txMcBl0", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["motors, tires, screech", "a woman, laughs, animal"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a 
girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "an insect buzzes around continuously"], "sample_ids": ["spJCm8tD9Zo", "v25l1jef3JY"], "start_seconds": ["90", "0"], "properties": ["snores, wheezes, sleeps", "buzzes, continuously, insect"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person is snoring loudly", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a woman speaks as she rubs two objects together"], "sample_ids": ["w5W5Kqtc8E", "vzxHnu-SFEw"], "start_seconds": ["100", "80"], "properties": ["water, splashes, motorboat", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a motorboat is moving and 
people are shouting and cheering ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["food is frying while a woman speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yhQ2Lg-7qDY", "sSMl2vc3ek"], "start_seconds": ["130", "20"], "properties": ["food, woman, speak", "loud, multiple, distance"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["wind blows and people talk while livestock vocalizes", "birds chirp and objects are moved around"], "sample_ids": ["vXlk0lIQBFo", "yPUYU6t3rwo"], "start_seconds": ["470", "370"], "properties": ["wind, talk, vocalize", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["t8CV69hcvF0", "wz7N8YRy74I"], "start_seconds": ["210", "30"], "properties": ["person, sneeze, follow", "rooster, crow, background, men"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["birds fly and flutter around", "a young woman speaks and laughs and an 
animal snorts"], "sample_ids": ["wGKgwOP3h30", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["fly, flutter, around", "a woman, laughs, animal"], "captions_pred_video": ["of the pigeons in the coop", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["pigeons coo and flap their wings", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 0}, {"captions": ["a girl talking, laughing and sneezing noise", "people cheer as a vehicle engine revs"], "sample_ids": ["y4tPJXBKDig", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["a, noise, talk", "engine revs, vehicle, people"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a clock ticktocks"], "sample_ids": ["sAam2NqGhLY", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a little girl sleeping on a couch", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person is snoring", "a clock is ticking loudly"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["a female speaks softly as paper crinkles", "water splashes as an animal walks through"], "sample_ids": ["xvDdE3zNf8Y", "w1ir-sZ3Im8"], "start_seconds": ["120", "90"], "properties": ["a, female, speaks", "animal, water, splashes"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman speaks and crumples paper", 
"water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tPJvjq9QePY", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["bleats, person, speak", "water, radio, man"], "captions_pred_video": ["a dog and a sheep in a barn", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a person speaking over a radio?", "label": 1}, {"captions": ["a helicopter engine runs", "winds blows roughly as a vehicle races past"], "sample_ids": ["t5ZbXbniOWk", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["engine, helicopter, run", "wind, blows, vehicle"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a helicopter is flying overhead ", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a woman speaks as frying food sizzles"], "sample_ids": ["v5P-ThUCINM", "wTideSjRFS0"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "food, sizzle, woman"], "captions_pred_video": [null, "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a woman is speaking while water is running in the background"], "question": "which entity is more likely to be in a restaurant", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sWZzXuWYY", "xBxDz0CFVn0"], "start_seconds": ["420", "30"], "properties": 
["male, speech, banging", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a speedboat passes quickly on the water"], "sample_ids": ["sLUnaPT5gM8", "tjmoSi330GM"], "start_seconds": ["0", "23"], "properties": ["loud, laughter, intermittent", "speed, water, boat"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["insects humming with a dog barking and small goat bleating", "an airplane engine runs"], "sample_ids": ["tIY7qOV3rEM", "yVPZ2MNWpms"], "start_seconds": ["0", "0"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "engine, airplane, runs"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["a power tool runs and touches a surface", "a diesel truck engine runs continuously"], "sample_ids": ["zfvPRf3chY", "sZvwOuuPGP0"], "start_seconds": ["290", "50"], "properties": ["power tool, run, touch", "engine, diesel, truck"], "captions_pred_video": [null, "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a medium engine is running "], "question": "which entity is a machine", "label": 1}, 
{"captions": ["a motorcycle engine works nearby", "an airplane engine runs"], "sample_ids": ["tOSWIURC-4", "yVPZ2MNWpms"], "start_seconds": ["0", "0"], "properties": ["engine, work, nearby", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a lawn mower is running ", "a car is driving by on the road "], "question": "which entity has a moving engine", "label": 1}, {"captions": ["children cry and people talk", "vehicles pass by on a roadway"], "sample_ids": ["xLwHe825Zs", "tgbONvsP47Y"], "start_seconds": ["18", "0"], "properties": ["people talk, children cry, people talk", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a baby cries and a woman speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine runs continuously", "people cheer as a vehicle engine revs"], "sample_ids": ["wdXV3Pv0jiY", "xjhAnI2q6hM"], "start_seconds": ["11", "6"], "properties": ["machine, running, continuously", "engine revs, vehicle, people"], "captions_pred_video": ["footage is blurry and shaky", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a truck is revving its engine and a man is speaking "], "question": "which machine is running continuously", "label": 0}, {"captions": ["a saw finishes running as metal clings in the background", "a power tool runs and touches a surface"], "sample_ids": ["zofjfKhqLk8", "zfvPRf3chY"], "start_seconds": ["10", "290"], "properties": ["background, metal, clings", "power tool, run, touch"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking while a power tool is being used "], "question": "which tool 
is touching a surface", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a person is burping then speaks and laughs"], "sample_ids": ["vs65y4qmyBE", "wAAkbZToh8"], "start_seconds": ["340", "0"], "properties": ["wind, blows, strongly", "burp, laugh, speak"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man burps and a woman speaks"], "question": "which entity is speaking", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "an airplane engine spools and people speak"], "sample_ids": ["yJ0TePmaOo", "wTjoRj1se3U"], "start_seconds": ["390", "390"], "properties": ["two hard objects, man, speak", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a jet engine is running and people are talking"], "question": "which object is moving", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["sWZzXuWYY", "vzxHnu-SFEw"], "start_seconds": ["420", "80"], "properties": ["male, clanks, thumps", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["a crowd yells, reacts and applauds", "females talk and laugh over gusting wind"], "sample_ids": ["wztCSUxOf8", "un9VQlzgZM"], "start_seconds": ["130", "5"], "properties": ["a crowd, yells, applauds", "females, talk, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is more likely to be at a sporting event", "label": 0}, {"captions": ["a man speaks as horns blow", "a man speaks followed by another man speaking outside"], "sample_ids": ["tHyNqRyK34A", "viuTg1M-dqg"], "start_seconds": ["24", "30"], "properties": ["a, man, speaks", "two men, speak, follow"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a horn rings out as a machine runs by"], "sample_ids": ["vZAw4apG0Es", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["background, tick, repeat", "a, horn, run"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a train coming down the tracks on a sunny day"], 
"captions_pred_audio": ["a clock is ticking and people are talking", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity has a horn?", "label": 1}, {"captions": ["water bubbles and gurgles.", "a car speeding up in the distance"], "sample_ids": ["tB7hWb9gTuQ", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["bubbles, gurgles, water", "distance, car, speed"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", null], "captions_pred_audio": ["water is splashing and gurgling", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["zofjfKhqLk8", "ziUT9IFTkjg"], "start_seconds": ["10", "10"], "properties": ["background, metal, clank", "background, birds, rustling"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "birds are chirping and a chime is ringing "], "question": "which entity has a bird in the background?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["wSVhSdj0F0", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["horn honks, keys jingle, slam", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a snort?", "label": 1}, {"captions": ["a duck quacks continuously", "television program is played far 
away while a woman talks and birds tweet nearby"], "sample_ids": ["vh30P49Po6s", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["quacks, continuously, duck", "a woman, a television program, a bird"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a dog is whimpering"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "waves crash against a shoreline and people speak"], "sample_ids": ["sfAvvZwdLCY", "yFB25fqfU8I"], "start_seconds": ["20", "300"], "properties": ["flushes, drains, water", "wave, crash, shoreline"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a source of water", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "a vehicle engine accelerating then running on idle"], "sample_ids": ["uYT5gxnyMWM", "vYkA3cfXp5Q"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "engine, accelerate, idle"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["birds chirp and wind blows", "a woman speaks happily and an animal chirps"], "sample_ids": ["sxIvBMSavMQ", "uWAAAL4CIoc"], "start_seconds": ["210", "0"], "properties": ["birds, chirp, wind", "a woman, chirps, animal"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to 
extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a goat screams and people speak in the background", "someone is typing on a computer keyboard"], "sample_ids": ["xC8kbrKJmco", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["background, goat, scream", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a goat is bleating ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a person whistles a meandering tune", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uFoga8sHpiw", "zFjIWfSD-4"], "start_seconds": ["90", "410"], "properties": ["person, tune, whistle", "People, motor, brakes"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vBslzh7saPw", "vYkA3cfXp5Q"], "start_seconds": ["90", "30"], "properties": ["power, scream, increase", 
"engine, accelerate, idle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a jet engine roars and accelerates ", "an engine is idling"], "question": "which engine is running on idle", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zl9Dqx-j7q4", "tDVADusiIoc"], "start_seconds": ["6", "60"], "properties": ["motors rev, laugh, loudly", "water, radio, man"], "captions_pred_video": ["footage of a man driving a car in the dark", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a train horn sounds as it passes by", "a train horn blows as it passes by"], "sample_ids": ["ukg5L09Wpvo", "zVacuqSb4LI"], "start_seconds": ["150", "30"], "properties": ["sound, train, horn", "horn, blows, train"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a 
train blows its whistle and blows its horn ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which train horn blows as it passes by", "label": 1}, {"captions": ["birds chirp and wind blows", "an engine runs loudly"], "sample_ids": ["sxIvBMSavMQ", "vqZuVbG6-HI"], "start_seconds": ["210", "130"], "properties": ["birds, chirp, wind", "loud, engine, run"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage is blurry because it's raining outside"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["a child speaks", "a duck quacks continuously"], "sample_ids": ["yW6FWLSLkx4", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["a, child, speaks", "quacks, continuously, duck"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a duck is quacking loudly"], "question": "which entity is speaking", "label": 0}, {"captions": ["a motor noise is accompanied by a door opening and closing", "pigeons vocalize and birds 
chirp"], "sample_ids": ["vBHyYJ8pL0", "uiS58TNyUiw"], "start_seconds": ["2", "430"], "properties": ["noise, door, opening", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zhx6hoYrHeI", "xfaoyyzw2WU"], "start_seconds": ["160", "180"], "properties": ["engine, sputter, rough", "loud, jet engine, roar"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a car accelerates and revs its engine ", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["s6DESzUTGjY", "vfYTJq7nU"], "start_seconds": ["16", "130"], "properties": ["wind, laugh, woman", "rustling, ducks, quack"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an 
aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["some people speak", "a child speaks in closed space"], "sample_ids": ["vbZ-0lGPneg", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "child, space, speak"], "captions_pred_video": ["of a man holding a baby duck in his hands", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking english", "label": 0}, {"captions": ["people speak in the background as a clock ticktocks", "someone is typing on a computer keyboard"], "sample_ids": ["vZAw4apG0Es", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["background, clock, ticktocks", "keyboard, type, computer"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "how to make money on youtube in spanish"], "captions_pred_audio": ["a clock is ticking and people are talking", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a infant makes noise and is excited"], "sample_ids": ["sofxkNWaP0s", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "noise, excited, infant"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", 
"a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vzceMbklWc", "uYT5gxnyMWM"], "start_seconds": ["180", "50"], "properties": ["water, faucet, sink", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["water is running and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["u6jIvCtKarQ", "w5W5Kqtc8E"], "start_seconds": ["70", "100"], "properties": ["a, man, speaks", "wind, blow, vehicle"], "captions_pred_video": ["footage of a person using a blender on a stove top", null], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["ukg5L09Wpvo", "sYITalLZjj4"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "water, rushes, background, birds"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "two ducks are swimming in the water near each other"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "wind blows and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sWZzXuWYY", "uYT5gxnyMWM"], "start_seconds": ["420", "50"], 
"properties": ["male, speech, banging", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a scream", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wRBHTgrbiwg", "sSMl2vc3ek"], "start_seconds": ["50", "20"], "properties": ["bird, owl, speak", "loud, multiple, distance"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["sWZzXuWYY", "wyllXV6PjKo"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "a baby, a woman, a man"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman speaks and a baby cries"], "question": "which entity is more quiet", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sQGXqGcwOTc", "tdWhHV3X25Q"], "start_seconds": ["3", "60"], "properties": ["audio, kid, giggles", "applause, audience, yells"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "water pouring and 
bubbling"], "sample_ids": ["yDoT73BWsdA", "uyRfq-jKPpo"], "start_seconds": ["10", "50"], "properties": ["engine, revs, vehicle", "water, bubbles, pouring"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a cat meows and children speak", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["x5cuQjOdM3E", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["cat, speak, children", "men, talk, cars"], "captions_pred_video": ["a black background with an airplane flying in the sky", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a door opens and birds chirp", "an adult male speaks and dials a rotary phone"], "sample_ids": ["yeFvk9x0wWI", "tK4VlLsNxak"], "start_seconds": ["30", "120"], "properties": ["door, open, birds", "An adult male speaks, dials, and speaks into a rotary phone"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and using a sewing machine"], "question": "which entity is a video", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a man speaks as a car is passing by"], "sample_ids": ["s3cTDAj31g", "sK4u5T8hW78"], "start_seconds": ["80", "30"], "properties": ["man, talk, woman", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["a woman and man speak while food is frying", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zk-xJGQU8-4", "tDVADusiIoc"], "start_seconds": ["130", "60"], "properties": ["food, man, woman", "water, radio, man"], "captions_pred_video": ["a man and a woman cooking in 
a wok on the stove", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["su6FAOcOA8c", "w34HjHr6gAY"], "start_seconds": ["4", "30"], "properties": ["engine, run, woman", "beeps, hit, woman"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a woman and man are speaking", "a stream of water runs briefly"], "sample_ids": ["vbpKkWvfOu4", "x-PeY8Yb8M4"], "start_seconds": ["560", "300"], "properties": ["two people, speaking, woman, man", "stream, water, run"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and 
a man is speaking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a power tool runs and touches a surface", "plastic is tapped on while someone speaks"], "sample_ids": ["zfvPRf3chY", "wvKpEYswXO0"], "start_seconds": ["290", "150"], "properties": ["power tool, run, touch", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a woman is speaking and tapping with background noise and water running "], "question": "which is not a power tool", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "winds blows roughly as a vehicle races past"], "sample_ids": ["tEE3MpBt1sg", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["drill, something, laugh", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wy1eKjR7KC0", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["people, talk, distance", "motor noise, horn, siren"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a car speeding up in the 
distance"], "sample_ids": ["v7jJS8aAyA", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["wind, blows, loudly", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a machine beeps continuously", "a duck quacks continuously"], "sample_ids": ["y682ml90jGw", "vh30P49Po6s"], "start_seconds": ["11", "30"], "properties": ["beeps, machine, continuously", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a beeping sound is being made ", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "some tunes played by whistling"], "sample_ids": ["wztCSUxOf8", "u6BnG6YZqJ4"], "start_seconds": ["130", "0"], "properties": ["a crowd, yells, applauds", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a man speaks as a motor runs in the background"], "sample_ids": ["ylpYOorfH4o", "xZepNM9qcRA"], "start_seconds": ["410", "30"], "properties": ["engine, running, wind", "background, motor, run"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace 
the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a man speaks as a car is passing by"], "sample_ids": ["sK4u5T8hW78", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "a, car, pass"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a photograph", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["un9VQlzgZM", "xfaoyyzw2WU"], "start_seconds": ["5", "180"], "properties": ["wind, 
speak, laugh", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sa6TLVbooCc", "wDVMhEdTiVw"], "start_seconds": ["240", "30"], "properties": ["people, laugh, child", "gun, shoot, water"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a child speaks in closed space"], "sample_ids": ["u6jIvCtKarQ", "yW6FWLSLkx4"], "start_seconds": ["70", "40"], "properties": ["a, man, speaks", "child, space, speak"], "captions_pred_video": ["footage of a person using a blender on a stove top", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 0}, {"captions": ["speaking following by laughing and clapping", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["u2f5NpsoHBg", "tIY7qOV3rEM"], "start_seconds": ["30", "0"], "properties": ["person, laugh, clap", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "a dog is standing in the middle of a dirt road in the woods"], 
"captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a dog is barking and a cat is meowing"], "question": "which animal is barking", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xyL9F5VrjkE", "y8WEcpOlT3I"], "start_seconds": ["20", "40"], "properties": ["engine, run, wind", "harsh, wind, blows"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "on how to use a sewing machine youtube"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["goats bleat and people speak", "an infant crying as a woman laughs"], "sample_ids": ["z5iUE5h0EPs", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["goats bleat, people speak, language", "a, laugh, infant"], "captions_pred_video": ["of the goat in the barn", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a goat bleats and a man speaks", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "some men converse over an engine running"], "sample_ids": ["uiItxDsDMFI", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["wood, piece, saw", "men, converse, engine"], "captions_pred_video": ["a man cutting a log with an axe in the woods", null], "captions_pred_audio": ["a saw is being used with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a piece of wood being sawed?", "label": 0}, {"captions": ["here comes the train and it starts to blow the horn and get close", "birds chirp and objects are moved around"], "sample_ids": ["s7knHCFW82w", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["blow horn, get close, 
train", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "insects buzz and a man speaks"], "question": "which entity is moving around objects", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a propeller rotates loudly and intensely"], "sample_ids": ["wSVhSdj0F0", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["horn honks, keys jingle, slam", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a duck quacks loudly and continuously"], "sample_ids": ["xBxDz0CFVn0", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["wind, chatter, people", "loud, continuous, quacks"], "captions_pred_video": ["footage is blurry and out of focus", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "dishes cling together then a man begins to speak"], "sample_ids": ["w2JXXIAdUdg", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["snoring, distance, person", "cling, speak, dishes"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person snoring and a dog 
whimpering", "mechanisms are operating and water is splashing "], "question": "which entity has a person speaking in the distance?", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "a woman speaks and a baby laughs"], "sample_ids": ["yaln9y8I7ms", "tOj4tdLRaA"], "start_seconds": ["230", "70"], "properties": ["female, flushes, toilet", "woman, laugh, baby"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a baby laughs and a woman speaks"], "question": "which entity has a baby laugh?", "label": 1}, {"captions": ["a machine runs continuously", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["wdXV3Pv0jiY", "zY3icUyMdh8"], "start_seconds": ["11", "20"], "properties": ["machine, running, continuously", "dog, bark, engine"], "captions_pred_video": ["footage is blurry and shaky", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a car is driving and dogs are barking and squealing "], "question": "which entity is not running continuously", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "dishes cling together then a man begins to speak"], "sample_ids": ["sncRqQ67iJU", "sQGXqGcwOTc"], "start_seconds": ["460", "3"], "properties": ["loud, repeatedly, man", "cling, speak, dishes"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person is snoring", "mechanisms are operating and water is splashing "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "an infant crying frantically"], "sample_ids": ["vhJWZheqaE", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["water drains unevenly, toilet flushes, water drains", "cry, infant, frantically"], "captions_pred_video": 
[null, "of the baby crying in the car seat"], "captions_pred_audio": ["a toilet is flushed", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "people cheer as a vehicle engine revs"], "sample_ids": ["wtDqrBygTcU", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["man, engine, run", "engine revs, vehicle, people"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a motor is running", "a truck is revving its engine and a man is speaking "], "question": "which entity has a man speaking as an engine runs?", "label": 0}, {"captions": ["a diesel truck engine runs while wind blows", "a woman speaks as she rubs two objects together"], "sample_ids": ["xyL9F5VrjkE", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["engine, run, wind", "two objects, woman, speak"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["an engine runs and a man speaks", "vehicles pass by on a roadway"], "sample_ids": ["yT5WfYMRr-U", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["engine, run, man", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "birds chirp and objects are moved around"], "sample_ids": ["uPDn2BFTHk", "yPUYU6t3rwo"], "start_seconds": ["140", "370"], "properties": ["lady, laugh, baby", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a baby laughs and a woman speaks", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["vz8868znkVQ", "ziUT9IFTkjg"], "start_seconds": ["60", "10"], "properties": ["audio, click, kid speaking", "background, birds, rustling"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", null], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "birds are chirping and a chime is ringing "], "question": "which entity has a bird in the background?", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a vehicle is skidding and squealing tires"], "sample_ids": ["wztCSUxOf8", "soTOh3zYJfY"], 
"start_seconds": ["130", "40"], "properties": ["a crowd, yells, applauds", "vehicle, skid, tires"], "captions_pred_video": [null, "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a helicopter engine idles continuously", "an airplane engine spools and people speak"], "sample_ids": ["ugHJF0hfYkg", "wTjoRj1se3U"], "start_seconds": ["10", "390"], "properties": ["engine, idle, continuously", "airplane, engine, spool"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a helicopter is flying overhead ", "a jet engine is running and people are talking"], "question": "which entity has a moving engine", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vf9xf3vMsGM", "xBxDz0CFVn0"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "stream, water, flow"], "captions_pred_video": ["of the person washing their hands under the faucet", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["an audience gives applause", "water flows as a woman laughs and a man speaks"], "sample_ids": ["x6iCUDmRpKQ", "vddP56-ogds"], "start_seconds": ["38", "30"], "properties": ["applause, audience, give", "water, flow, laugh"], "captions_pred_video": ["a black background with the moon and stars in the sky", null], "captions_pred_audio": ["a group of people are clapping and cheering", "water is running and gurgling and a man 
is speaking"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["w0xsN8X18Y", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["music, surface, rain", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["running water in a faucet with some clinks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zNRChLjqcU", "uZesmtKZGSw"], "start_seconds": ["220", "250"], "properties": ["water, faucet, run", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks as insects buzz and a bird chirps", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["t25U-v4k4ts", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": ["a, chirps, bird", "loud, laughter, intermittent"], "captions_pred_video": ["of 
a beekeeper working on a beehive in the woods", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an adult woman and an adult man speak", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zTLVJCo4WEE", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["two people, adult, speak", "airplane, boy, fly"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a boy speaking?", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a vehicle accelerates and squeals tires"], "sample_ids": ["tK4VlLsNxak", "yRx9txMcBl0"], "start_seconds": ["120", "40"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "accelerates, tires, squeals"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a car is revving its engine and skidding "], 
"question": "which entity is a vehicle", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "water pouring and bubbling"], "sample_ids": ["vs65y4qmyBE", "uyRfq-jKPpo"], "start_seconds": ["340", "50"], "properties": ["engine, run, man", "water, bubbles, pouring"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wfHeoPDLMaM", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "engine, accelerate, idle"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with 
a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["ducks are quacking", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tK4VlLsNxak", "vb1fPSDI4c"], "start_seconds": ["120", "30"], "properties": ["a, dial, telephone", "multiple, people, yell"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["yRx9txMcBl0", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["accelerates, tires, squeals", "applause, audience, yells"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a 
man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman and man speak while food is frying", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zk-xJGQU8-4", "vb1fPSDI4c"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "multiple, people, yell"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a horn rings out as a machine runs by"], "sample_ids": ["wudZTNBtVqc", "slZLHwNbbt4"], "start_seconds": ["60", "300"], "properties": ["accelerates, engine, wind", "a, horn, run"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a man talks while metallic objects are rapped and steam is released"], "sample_ids": ["sSMl2vc3ek", "vuUVPzd2FXw"], "start_seconds": ["20", "160"], "properties": ["a person, laughs, snores", "a, steam, release"], "captions_pred_video": [null, "of the person cooking on the grill with a spatula"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and dishes are clanging"], "question": "which entity is about a person releasing steam?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "someone 
is typing on a computer keyboard"], "sample_ids": ["tDVADusiIoc", "v0x1odnXtP0"], "start_seconds": ["60", "210"], "properties": ["water, radio, man", "keyboard, type, computer"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person is typing on a keyboard"], "question": "which entity is a person", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a infant makes noise and is excited"], "sample_ids": ["ul60S8TXDA8", "wIJK3-5y0kA"], "start_seconds": ["60", "30"], "properties": ["sound, distance, bell", "noise, excited, infant"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "an airplane engine spools and people speak"], "sample_ids": ["vJ7JPEFhyLA", "wTjoRj1se3U"], "start_seconds": ["16", "390"], "properties": ["three men, wind, flow", "airplane, engine, spool"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine is running and people are talking"], "question": "which entity is about a moving object", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "an insect buzzes around continuously"], "sample_ids": ["yRx9txMcBl0", "v25l1jef3JY"], "start_seconds": ["40", "0"], "properties": ["accelerates, tires, squeals", "buzzes, continuously, insect"], "captions_pred_video": ["in 10 words or less the video is about a 
red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a fly is buzzing around a microphone "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["birds chirp as a bell rings", "birds chirp and objects are moved around"], "sample_ids": ["ziUT9IFTkjg", "yPUYU6t3rwo"], "start_seconds": ["10", "370"], "properties": ["chirp, bell, ring", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["y8dSeubCNI", "sLUnaPT5gM8"], "start_seconds": ["4", "0"], "properties": ["engine revving, people speaking, motorcycle", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["an engine revving and people talking in the background", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["running water in a faucet with some clinks", "an airplane engine spools and people speak"], "sample_ids": ["zNRChLjqcU", 
"wTjoRj1se3U"], "start_seconds": ["220", "390"], "properties": ["water, faucet, run", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["water is running from a faucet into a sink", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a duck quacks several times", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": ["vh30P49Po6s", "wnpJndXuxLc"], "start_seconds": ["30", "50"], "properties": ["quacks, duck, several", "beeps, loud, whistle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a duck is quacking loudly", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "vehicles pass by on a roadway"], "sample_ids": ["tDVADusiIoc", "tgbONvsP47Y"], "start_seconds": ["60", "0"], "properties": ["man, radio, blows", "pass, vehicle, roadway"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a child speaks"], "sample_ids": ["vddP56-ogds", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["liquid, laughs, man", "a, child, speaks"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking with background noise and breathing sounds 
"], "question": "which entity is a person", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["s7knHCFW82w", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["blow horn, get close, train", "a woman, laughs, animal"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "dishes cling together then a man begins to speak"], "sample_ids": ["v0x1odnXtP0", "sQGXqGcwOTc"], "start_seconds": ["210", "3"], "properties": ["keyboard, type, computer", "cling, speak, dishes"], "captions_pred_video": ["how to make money on youtube in spanish", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a person is typing on a keyboard", "mechanisms are operating and water is splashing "], "question": "which entity is about speaking", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a vehicle is skidding and squealing tires"], "sample_ids": ["vXlk0lIQBFo", "soTOh3zYJfY"], "start_seconds": ["470", "40"], "properties": ["wind, speak, vocalize", "vehicle, skid, tires"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "wind blows as people chatter quietly"], "sample_ids": ["zF8yoL0rkbI", "xBxDz0CFVn0"], "start_seconds": 
["30", "30"], "properties": ["engine, run, someone", "wind, chatter, people"], "captions_pred_video": ["footage of the traffic on the street at night", "footage is blurry and out of focus"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["an insect buzzes around continuously", "a train horn blows as it passes by"], "sample_ids": ["v25l1jef3JY", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "horn, blows, train"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vzxHnu-SFEw", "tDVADusiIoc"], "start_seconds": ["80", "60"], "properties": ["two objects, woman, speak", "water, radio, man"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a man speaks as a car is passing by"], "sample_ids": ["xOZfdgAgJ9o", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["woman, whimpering, speaking", "a, car, pass"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with background noise and 
breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xBxDz0CFVn0", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["wind, chatter, people", "loud, multiple, distance"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person is whistling", "water pouring and bubbling"], "sample_ids": ["sIXTftIuUgw", "uyRfq-jKPpo"], "start_seconds": ["90", "50"], "properties": ["person, whistling, person", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a person whistling a song", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks while water drains", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vSeGhaZt-aI", "xfaoyyzw2WU"], "start_seconds": ["50", "180"], "properties": ["water, drain, man", "loud, jet engine, roar"], "captions_pred_video": ["a man in a kitchen preparing 
a smoothie with a blender", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a small engine spits as it runs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sZvwOuuPGP0", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["spits, engine, runs", "gun, shoot, water"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a medium engine is running ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a gun?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "a train horn blares as a train passes, then fades"], "sample_ids": ["w5W5Kqtc8E", "zVacuqSb4LI"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "blares, fades, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a train whistle blows and a 
train passes by with a whistle blowing "], "question": "which entity is about a train?", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["w5W5Kqtc8E", "uEU-Hg5MTN8"], "start_seconds": ["100", "27"], "properties": ["wind, blow, vehicle", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman laughing?", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "several insects fly while two men talk"], "sample_ids": ["sjlVMgdGSK0", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["accelerates, vehicle, race car", "several, fly, men"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["water bubbles and gurgles.", "a door opens and closes"], "sample_ids": ["tB7hWb9gTuQ", "vBHyYJ8pL0"], "start_seconds": ["30", "2"], "properties": ["bubbles, gurgles, water", "open, close, door"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", null], "captions_pred_audio": ["water is splashing and gurgling", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["v0wPrLBI3hg", "zl9Dqx-j7q4"], "start_seconds": 
["30", "6"], "properties": ["vocalize, bird, speak", "engine, laugh, loud"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "footage of a man driving a car in the dark"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "an infant crying as a woman laughs"], "sample_ids": ["yajyRTUQk3U", "xhmRY9yhC7c"], "start_seconds": ["400", "20"], "properties": ["a woman, something, fried", "a, laugh, infant"], "captions_pred_video": ["- a woman cooking in the kitchen", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a baby cries and a woman speaks"], "question": "which woman is laughing", "label": 1}, {"captions": ["an airplane engine spools and people speak", "water flows as men speak and yell"], "sample_ids": ["wTjoRj1se3U", "vJ7JPEFhyLA"], "start_seconds": ["390", "16"], "properties": ["airplane, engine, spool", "water, flow, men"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a moving object?", "label": 0}, {"captions": ["a helicopter engine runs continuously", "a man speaks followed by another man speaking outside"], "sample_ids": ["ugHJF0hfYkg", "viuTg1M-dqg"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "two men, speak, follow"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking with background noise and breathing 
sounds "], "question": "which entity is a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "dishes cling together then a man begins to speak"], "sample_ids": ["sOa7g-44Dag", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["audio, scratching, man", "cling, speak, dishes"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a heavy rain falls endlessly", "a man speaks as a car is passing by"], "sample_ids": ["wP8ZKrlx3oA", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["heavy, rain, fall", "a, car, pass"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["ugHJF0hfYkg", "uqFtmnhuqA8"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", "a, b, c"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", 
"shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["a helicopter is flying overhead ", "mechanisms are ticking and a hammer is striking "], "question": "which entity is quieter", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["w0xsN8X18Y", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["rain, thunder, surface", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yZrFNS7GFBQ", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["pigeon, buzzes, insect", "rooster, crow, background, men"], "captions_pred_video": ["of the bird in the cage", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird?", "label": 0}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["t69a8aRKhmc", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["a, b, c", "a, chirps, bird"], "captions_pred_video": ["footage is blurry and out of focus", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and bees are buzzing"], "question": "which entity has a bird chirp?", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "water flows 
and trickles"], "sample_ids": ["zCrAfDfv6-A", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "water, flow, trickle"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a person whistles a song", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["yDoT73BWsdA", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["engine revs, tires squeal, vehicle", "applause, audience, yells"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a child yells and another yells", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vMDHu7Lxcgw", "su6FAOcOA8c"], "start_seconds": ["410", "4"], "properties": ["two, yell, child", "engine, idle, woman"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a car accelerates and wind blows"], "sample_ids": ["wP8ZKrlx3oA", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["rain, storm, thunder", "accelerates, wind, blows"], "captions_pred_video": ["footage of a 
flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an insect buzzes around continuously", "a machine beeps continuously"], "sample_ids": ["v25l1jef3JY", "y682ml90jGw"], "start_seconds": ["0", "11"], "properties": ["buzzes, continuously, insect", "beeps, machine, continuously"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a beeping sound is being made "], "question": "which entity is a machine", "label": 1}, {"captions": ["a train horn blows as it passes by", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zVacuqSb4LI", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["horn, blows, train", "engine, accelerate, idle"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, 
{"captions": ["a loud engine muffles a man as he speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xyx6eNVEYRY", "zj2R0XoFr5k"], "start_seconds": ["380", "50"], "properties": ["loud, engine, muffles", "airplane, boy, fly"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a baby cries and a woman speaks", "vehicles pass by on a roadway"], "sample_ids": ["tMbMDvT50j8", "tgbONvsP47Y"], "start_seconds": ["12", "0"], "properties": ["a, cry, woman", "pass, vehicle, roadway"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a baby cries and a woman speaks", "a car is driving on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yswmmRZFItk", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["background, frog, croak", "male, duck, laugh"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "several insects fly while two men talk"], "sample_ids": ["zkKdxzNC97Y", "s-T9OVOiMLo"], "start_seconds": ["27", "330"], "properties": ["hard, surface, door", "several, fly, men"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a man climbing up 
a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a person snoring several times", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["spJCm8tD9Zo", "uZesmtKZGSw"], "start_seconds": ["90", "250"], "properties": ["snore, person, several", "men, talk, cars"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds chirp and a pop occurs before a man speaks", "vehicles pass by on a roadway"], "sample_ids": ["zuua6-5goWw", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["sound, pop, bird", "pass, vehicle, roadway"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a stream of water runs briefly"], "sample_ids": ["zY3icUyMdh8", "x-PeY8Yb8M4"], "start_seconds": ["20", "300"], "properties": ["dog, bark, engine", "stream, water, run"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["ugHJF0hfYkg", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["loud, intense, propeller", "People, motor, brakes"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which is quieter", "label": 1}, {"captions": ["someone whistles a song", "several insects fly while two men talk"], "sample_ids": ["sIXTftIuUgw", "s-T9OVOiMLo"], "start_seconds": ["90", "330"], "properties": ["someone, song, whistle", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a person whistling a song", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a 
person", "label": 0}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a man speaks as a motor runs in the background"], "sample_ids": ["wP8ZKrlx3oA", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["rain, storm, thunder", "background, motor, run"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "waves crash against a shoreline and people speak"], "sample_ids": ["vlJS7LN2XyM", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["background, clocks, ticking", "wave, crash, shoreline"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zkKdxzNC97Y", "uZesmtKZGSw"], "start_seconds": ["27", "250"], "properties": ["hard, surface, door", "men, talk, cars"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["goats bleat and metal clings", "a clock ticktocks"], "sample_ids": ["tH17JPjDPnc", "v-g-j2uTByM"], "start_seconds": ["260", "30"], "properties": ["bleat, metal, clings", "ticktocks, clock, ticktocks"], "captions_pred_video": ["feed of the goats eating hay in the barn", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["y2ZBGpgbhHM", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["animal, growl, bird", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds chirping and a dog panting", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["birds chirp as a bell rings", "some men converse over an engine running"], "sample_ids": ["ziUT9IFTkjg", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["chirp, bell, ring", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a human activity", "label": 1}, {"captions": ["repeated tapping is accompanied by water running 
and a woman speaking softly", "water flows as men speak and yell"], "sample_ids": ["wvKpEYswXO0", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["sound, water, running", "water, flow, men"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water flowing", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a toilet flushes and a female speaks"], "sample_ids": ["vYkA3cfXp5Q", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["engine, accelerate, idle", "female, flushes, toilet"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["an engine is idling", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a woman speaks with water running"], "sample_ids": ["sQwlkXjQabo", "wTideSjRFS0"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "water, running, woman"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["spraying followed by silence", "a woman is speaking while water is running in the background"], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["bees buzz and wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tMJne1a4AFI", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["bees buzz, wind blows, bees", "a woman, something, fried"], "captions_pred_video": ["a swarm of bees on the ground", 
"- a woman cooking in the kitchen"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a clock ticktocks"], "sample_ids": ["v0wPrLBI3hg", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["vocalize, bird, speak", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yks4cLgIDMc", "wz7N8YRy74I"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "rooster, crow, background, men"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a child is crying", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a duck quacks continuously"], "sample_ids": ["tDVADusiIoc", "vh30P49Po6s"], "start_seconds": ["60", "30"], "properties": ["wind, radio, waves", "quacks, continuously, duck"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, 
{"captions": ["a person whistles and clicks a mouse", "a few ducks quack and scamper and a man speaks"], "sample_ids": ["zCrAfDfv6-A", "w2bYrCVLT60"], "start_seconds": ["30", "120"], "properties": ["person, mouse, click", "ducks, speak, quack"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "of the ducks drinking from a pink pool in the grass"], "captions_pred_audio": ["a person whistles a song", "ducks are quacking and a man is speaking"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person snoring several times", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["spJCm8tD9Zo", "zFjIWfSD-4"], "start_seconds": ["90", "410"], "properties": ["snore, person, several", "People, motor, brakes"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "dishes cling together then a man begins to speak"], "sample_ids": ["wAAkbZToh8", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["burp, laugh, speak", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man burps and a woman speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about speaking", "label": 1}, {"captions": ["a duck quacks several times", "a car speeding up in the distance"], "sample_ids": ["vh30P49Po6s", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["quacks, duck, several", "distance, car, speed"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a race car accelerates and revs its engine "], "question": 
"which entity is moving faster", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "water splashes as an animal walks through"], "sample_ids": ["zgUgkpk78xU", "w1ir-sZ3Im8"], "start_seconds": ["70", "90"], "properties": ["horn, bells, ring", "animal, water, splashes"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a car accelerates and wind blows"], "sample_ids": ["wjsXBsc7M40", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "accelerates, wind, blows"], "captions_pred_video": ["footage of the baby playing with a toothbrush", null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "an insect buzzes around continuously"], "sample_ids": ["zuua6-5goWw", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["birds, chirp, quiet, man, speaks", "buzzes, continuously, insect"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a horn rings out as a machine runs by"], "sample_ids": ["vBHyYJ8pL0", "slZLHwNbbt4"], "start_seconds": ["2", "300"], "properties": ["noise, door, opening", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, {"captions": ["some tunes played by whistling", "a toilet flushes and a female speaks"], "sample_ids": ["u6BnG6YZqJ4", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["tune, play, whistling", "female, flushes, toilet"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a toilet flushes and a man speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "water is sprayed across a hard surface"], "sample_ids": ["sofxkNWaP0s", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["wind, engine, louder", "water, spray, surface"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign 
in the foreground", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a child speaks in closed space"], "sample_ids": ["sLUnaPT5gM8", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["loud, laughter, intermittent", "child, space, speak"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child yells and another yells", "a horn honks and then loudly blares"], "sample_ids": ["vMDHu7Lxcgw", "wnpJndXuxLc"], "start_seconds": ["410", "50"], "properties": ["two, yell, child", "horn, honk, loud"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "a horse runs while two women talk"], "sample_ids": ["sdXV-ylviw", "sdvI1mHAsc"], "start_seconds": ["190", "20"], "properties": ["door, toilet, squeaks", "two women, horse, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and taps with background noise ", "horses clip-clop and a woman speaks"], "question": "which entity is a living thing", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "some 
men converse over an engine running"], "sample_ids": ["sQGXqGcwOTc", "sCiy7QS1U"], "start_seconds": ["3", "300"], "properties": ["audio, kid, giggles", "men, converse, engine"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man speaks as a machine runs"], "sample_ids": ["vD6lYD1l0BY", "vD6lYD1l0BY"], "start_seconds": ["330", "330"], "properties": ["a, machine, run", "a, machine, run"], "captions_pred_video": ["game controller being held in the hands of the person", "game controller being held in the hands of the person"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking and dishes are being washed "], "question": "which machine is running in the first image?", "label": 0}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vddP56-ogds", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["water, splash, person, laugh", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a piece of wood is being placed down and sawed"], "sample_ids": ["tQWGZLItBXk", "uiItxDsDMFI"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "wood, piece, saw"], "captions_pred_video": ["worms revolution screenshots", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects 
play ", "a saw is being used with background noise "], "question": "which entity is a video of a person sawing wood?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "paper is crumpling consistently"], "sample_ids": ["uJV8NDaHqqk", "v5cSxLaHADY"], "start_seconds": ["100", "0"], "properties": ["loud, fly, chirp", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a bee hive in a wooden box", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a swarm of bees buzzing around", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "wind blows as people chatter quietly"], "sample_ids": ["sncRqQ67iJU", "xBxDz0CFVn0"], "start_seconds": ["460", "30"], "properties": ["loud, repeatedly, man", "wind, chatter, people"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a person is snoring", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "an airplane engine spools and people speak"], "sample_ids": ["vbpKkWvfOu4", "wTjoRj1se3U"], "start_seconds": ["560", "390"], "properties": ["a, woman, man", "airplane, engine, spool"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a jet engine is running and people are talking"], "question": "which entity is a video of a woman speaking and other women and a man talk with her?", "label": 0}, 
{"captions": ["birds chirp and objects are moved around", "a woman speaks happily and an animal chirps"], "sample_ids": ["yPUYU6t3rwo", "uWAAAL4CIoc"], "start_seconds": ["370", "0"], "properties": ["birds chirp, objects are moved around, birds", "a woman, chirps, animal"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", null], "captions_pred_audio": ["insects buzz and a man speaks", "a woman is speaking and a dog is barking "], "question": "which entity is a video", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xyL9F5VrjkE", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["wind, blows, vehicle", "female, spraying, scream"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman and man speak while food is frying", "an infant crying frantically"], "sample_ids": ["zk-xJGQU8-4", "zwOBqeFTgiU"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "cry, infant, frantically"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tjmoSi330GM", "wz7N8YRy74I"], "start_seconds": ["23", "30"], "properties": ["speed, water, boat", "rooster, crow, background, men"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the 
background", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a rooster?", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "an airplane flies overhead as a woman speaks"], "sample_ids": ["ul60S8TXDA8", "zj2R0XoFr5k"], "start_seconds": ["60", "50"], "properties": ["sound, distance, bell", "airplane, fly, overhead"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying overhead", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a man speaks as a car is passing by"], "sample_ids": ["yZp6xizR0yU", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "a, car, pass"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["people clap and speak in the distance", "someone snores nearby"], "sample_ids": ["wwyfGO2J4", "spJCm8tD9Zo"], "start_seconds": ["90", 
"90"], "properties": ["clap, distance, speak", "someone snores, nearby, someone"], "captions_pred_video": [null, "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a person is snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["people speak softly as food sizzles", "sirens ring and approach with humming of distant traffic"], "sample_ids": ["yhQ2Lg-7qDY", "xERFUeZONz8"], "start_seconds": ["130", "0"], "properties": ["food, sizzle, speak", "ring, approach, traffic"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage is blurry due to camera shake or motion blur"], "captions_pred_audio": ["a faucet is running and a man is speaking", "an emergency vehicle siren blares"], "question": "which entity is more quiet", "label": 0}, {"captions": ["people cheer as a vehicle engine revs", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xjhAnI2q6hM", "tdWhHV3X25Q"], "start_seconds": ["6", "60"], "properties": ["engine revs, vehicle, people", "applause, audience, yells"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "water splashes as an animal walks through"], "sample_ids": ["wPz6QRAkEb4", "w1ir-sZ3Im8"], "start_seconds": ["60", "90"], "properties": ["chirps, tweets, song", "animal, water, splashes"], "captions_pred_video": ["a bird in a cage on top of a pole", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds are chirping in the background ", "water splashes and gurgles as people 
speak"], "question": "which entity is not a bird?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "pigeons vocalize and birds chirp"], "sample_ids": ["wSVhSdj0F0", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["horn honks, keys jingle, slam", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a car accelerates and wind blows"], "sample_ids": ["wfHeoPDLMaM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["quacking, squawking, ducks", "accelerates, wind, blows"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "a airplane flies overhead as a woman speaks"], "sample_ids": ["sdXV-ylviw", "zj2R0XoFr5k"], 
"start_seconds": ["190", "50"], "properties": ["door, toilet, squeaks", "airplane, fly, woman"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a duck quacks continuously", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vh30P49Po6s", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["quacks, continuously, duck", "male, duck, laugh"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking and ducks are quacking"], "question": "which duck is speaking", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sZPuqDgX2V0", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["commentator, race, track", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which vehicle is racing around a track", "label": 0}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wz7N8YRy74I", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["rooster, crow, background, people", "loud, laughter, intermittent"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a baby 
is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a motorcycle engine works nearby", "water flows as men speak and yell"], "sample_ids": ["tOSWIURC-4", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["engine, work, nearby", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vJrjSeP17yE", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["a person is sleeping, snoring, person", "female, spraying, scream"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["a propeller moves loudly nearby", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["ugHJF0hfYkg", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["loud, propeller, move", "a woman, something, fried"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "paper is crumpling consistently"], "sample_ids": ["yYJksgsxx5U", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["audio, clicks, scraping", "paper is crumpling, paper is white, paper 
is crumpling"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["speaking following by laughing and clapping", "an infant crying as a woman laughs"], "sample_ids": ["u2f5NpsoHBg", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["person, laugh, clap", "a, laugh, infant"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a baby cries and a woman speaks"], "question": "which person is laughing", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "water splashes as an animal walks through"], "sample_ids": ["zFjIWfSD-4", "w1ir-sZ3Im8"], "start_seconds": ["410", "90"], "properties": ["People, motor, brakes", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "an infant crying frantically"], "sample_ids": ["tDlfY3nmx1A", "zwOBqeFTgiU"], "start_seconds": ["160", "30"], "properties": ["applause, laugh, man", "cry, infant, frantically"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "of the baby crying in the car seat"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["goats bleat and people speak", 
"an insect buzzes around continuously"], "sample_ids": ["z5iUE5h0EPs", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["goats bleat, people speak, language", "buzzes, continuously, insect"], "captions_pred_video": ["of the goat in the barn", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a goat bleats and a man speaks", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["a child speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["yW6FWLSLkx4", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["a, child, speaks", "engine revs, vehicle, people"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["water flows followed by women screaming", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["w5W5Kqtc8E", "vlS6YMeWAPo"], "start_seconds": ["100", "40"], "properties": ["water, flow, women", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a goat bleats and birds chirp"], "question": "which entity is followed by birds chirping", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "an airplane engine spools and people speak"], "sample_ids": ["zofjfKhqLk8", "wTjoRj1se3U"], "start_seconds": ["10", "390"], "properties": ["noise, stop, motor", "airplane, engine, spool"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a man playing with a remote 
control airplane in a field"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["people speak as gunfire rings out", "a woman talks while a baby cries and a man whispers"], "sample_ids": ["wqTCwqVRDlk", "smDKStoHBJo"], "start_seconds": ["80", "0"], "properties": ["gunfire, ring, speak", "a, talk, baby, cry"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "a man holding a crying baby in his arms"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a baby is crying and a woman is speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "several insects fly while two men talk"], "sample_ids": ["sofxkNWaP0s", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["wind, engine, louder", "several, fly, men"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a train horn blows as it passes by"], "sample_ids": ["vBslzh7saPw", "zVacuqSb4LI"], "start_seconds": ["90", "30"], "properties": ["engine, spools, takes", "horn, blows, train"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["an adult woman and an adult man speak", "water pouring and bubbling"], "sample_ids": ["zTLVJCo4WEE", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["two people, adult, speak", "water, bubbles, pouring"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman speaks and crickets chirp", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a bus engine idles while a woman speaks making an announcement"], 
"sample_ids": ["zOZleIRqZm4", "su6FAOcOA8c"], "start_seconds": ["80", "4"], "properties": ["rustling, leaves, person", "engine, idle, woman"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a subway train is moving "], "question": "which entity is a person speaking over?", "label": 0}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a car accelerates and wind blows"], "sample_ids": ["yI-KvObbDoY", "u0TrcHhkPQ"], "start_seconds": ["260", "20"], "properties": ["sound, smack, wind", "accelerates, wind, blows"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", null], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["people speak and tapping occurs", "a propeller rotates loudly and intensely"], "sample_ids": ["tFCUUGdREgA", "ugHJF0hfYkg"], "start_seconds": ["70", "10"], "properties": ["people, tap, speak", "loud, intense, propeller"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man is filing a hard object", "an airplane engine spools and people speak"], "sample_ids": ["vveS8HT7Uog", "wTjoRj1se3U"], "start_seconds": ["100", "390"], "properties": ["a man, hard, object", "airplane, engine, spool"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a man playing with a remote control 
airplane in a field"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a jet engine is running and people are talking"], "question": "which object is moving", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "heavy rain splashes as it falls"], "sample_ids": ["vdoxuJn9lTc", "wP8ZKrlx3oA"], "start_seconds": ["40", "40"], "properties": ["burp, loud, girl", "fall, rain, splash"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a child speaks followed by a burp", "a heavy rain is falling on a surface"], "question": "which entity is more likely to cause a splash", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a man speaks, another man speaks, and a small bell dings"], "sample_ids": ["yajyRTUQk3U", "t69a8aRKhmc"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "a, b, c"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and birds are chirping in the background "], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["water running down a sink while a man is talking", "a clock ticktocks"], "sample_ids": ["vSeGhaZt-aI", "v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a bus engine idles while 
a woman speaks making an announcement", "motors runs briefly and tires screech"], "sample_ids": ["su6FAOcOA8c", "yRx9txMcBl0"], "start_seconds": ["4", "40"], "properties": ["engine, idle, woman", "motors, tires, screech"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a car is revving its engine and skidding "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a train horn sounds as a railroad passing bell rings", "someone whistles a tune"], "sample_ids": ["zgUgkpk78xU", "sIXTftIuUgw"], "start_seconds": ["70", "90"], "properties": ["horn, bell, train", "someone, tune, whistle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["vehicles pass by on a roadway", "an airplane engine spools and people speak"], "sample_ids": ["tgbONvsP47Y", "wTjoRj1se3U"], "start_seconds": ["0", "390"], "properties": 
["pass, vehicle, roadway", "airplane, engine, spool"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a car is driving on the road ", "a jet engine is running and people are talking"], "question": "which is not a moving object", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "water splashes as an animal walks through"], "sample_ids": ["uWAAAL4CIoc", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["a woman, chirps, animal", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "water splashes and gurgles as people speak"], "question": "which animal is more active", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w5W5Kqtc8E", "wDVMhEdTiVw"], "start_seconds": ["100", "30"], "properties": ["wind, blow, vehicle", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["dogs barking and whimpering", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tIY7qOV3rEM", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["barking, whimpering, dog", "two men, woman, birds"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, 
{"captions": ["a woman speaks as she rubs two objects together", "a stream of water runs briefly"], "sample_ids": ["vzxHnu-SFEw", "x-PeY8Yb8M4"], "start_seconds": ["80", "300"], "properties": ["two objects, woman, speak", "stream, water, run"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a weapon fires multiple times", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sMC07Ucy7kg", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["weapon, fire, multiple", "a, scream, girl"], "captions_pred_video": ["footage is from a car's point of view", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks and a baby laughs", 
"a person snores loudly multiple times at a close distance"], "sample_ids": ["tOj4tdLRaA", "sSMl2vc3ek"], "start_seconds": ["70", "20"], "properties": ["woman, laugh, baby", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vbZ-0lGPneg", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["a woman, a television program, a bird", "sheep, baa, birds"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a goat bleats and birds chirp"], "question": "which entity has more birds", "label": 1}, {"captions": ["a power tool runs and touches a surface", "several ducks quack and cocks crow far away"], "sample_ids": ["zfvPRf3chY", "sNB8zxXneIM"], "start_seconds": ["290", "20"], "properties": ["power tool, run, touch", "several, quack, cocks"], "captions_pred_video": [null, "a group of geese in a cage"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a rooster is crowing and wind is blowing "], "question": "which is not a power tool", "label": 1}, {"captions": ["birds chirp and wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["sxIvBMSavMQ", "viuTg1M-dqg"], "start_seconds": ["210", "30"], "properties": ["birds, chirp, wind", "two men, speak, follow"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive 
beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "roadway noise occurs and a truck accelerates"], "sample_ids": ["u--KhUW8l1Y", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["engine, sound, horn", "noise, truck, accelerate"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a telephone rings followed by a woman talking"], "sample_ids": ["w2JXXIAdUdg", "tGcFnX0GHI"], "start_seconds": ["10", "0"], "properties": ["snoring, distance, person", "ring, talk, woman"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a toilet flushes and a female speaks"], "sample_ids": ["wP8ZKrlx3oA", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": 
["heavy, rain, fall", "female, flushes, toilet"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a toilet flushes and a man speaks"], "question": "which entity is not a video of a toilet flushing?", "label": 0}, {"captions": ["ticking continues without interruption", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["v-g-j2uTByM", "xV7Mg1QucSc"], "start_seconds": ["30", "14"], "properties": ["ticking, continuous, clock", "alarm, ticktocks, laughs"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking loudly", "an alarm clock ticks and a woman laughs"], "question": "which clock is ticking continuously", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "a man speaks as a motor runs in the background"], "sample_ids": ["uYT5gxnyMWM", "xZepNM9qcRA"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "background, motor, run"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a machine beeps continuously"], "sample_ids": ["yRx9txMcBl0", "y682ml90jGw"], "start_seconds": ["40", "11"], "properties": ["accelerates, tires, squeals", "beeps, machine, continuously"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 
mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a machine runs continuously", "water flows and trickles"], "sample_ids": ["wdXV3Pv0jiY", "tB7hWb9gTuQ"], "start_seconds": ["11", "30"], "properties": ["machine, running, continuously", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and shaky", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "water is splashing and gurgling"], "question": "which entity is not a continuous flow", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zALy31PjDl0", "zj2R0XoFr5k"], "start_seconds": ["21", "50"], "properties": ["a man, a vehicle, a horn", "airplane, boy, fly"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": 
["vfYTJq7nU", "xKB8O8LTs6s"], "start_seconds": ["130", "70"], "properties": ["rustling, ducks, quack", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a duck quacks and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a stream of water flows as people talk and wind blows"], "sample_ids": ["slZLHwNbbt4", "xBxDz0CFVn0"], "start_seconds": ["300", "30"], "properties": ["clap, distance, horn", "stream, water, flow"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a horn rings out as a machine runs by"], "sample_ids": ["sK4u5T8hW78", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["a, car, pass", "a, horn, run"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a 
child babbles as a woman speaks", "water pouring and bubbling"], "sample_ids": ["wEBlkGWVWwE", "uyRfq-jKPpo"], "start_seconds": ["260", "50"], "properties": ["a, babble, woman", "water, bubbles, pouring"], "captions_pred_video": ["shows a person writing on the whiteboard", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a train engine runs and a horn blows"], "sample_ids": ["w1mlz3Pe4fU", "zPX9o1uDiI"], "start_seconds": ["300", "40"], "properties": ["vocalize, chirp, continuously", "engine, horn, run"], "captions_pred_video": ["of a bird in a cage", null], "captions_pred_audio": ["birds are chirping and singing", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is not a train?", "label": 0}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "people applaud and hoot and chat quietly"], "sample_ids": ["ukg5L09Wpvo", "wwyfGO2J4"], "start_seconds": ["150", "90"], "properties": ["clickety-clack, train, whistle", "people, applaud, hoot"], 
"captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a car speeding up in the distance"], "sample_ids": ["wqN6IIHw3po", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["rain, surface, fall", "distance, car, speed"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", null], "captions_pred_audio": ["a man is speaking and water is splashing", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "water pouring and bubbling"], "sample_ids": ["wqZ135Ssz0", "uyRfq-jKPpo"], "start_seconds": ["60", "50"], "properties": ["two men, woman, birds", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "water is running from a faucet"], "question": "which entity is 
more likely to be in a bathroom", "label": 1}, {"captions": ["a child speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yW6FWLSLkx4", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["a, child, speaks", "female, spraying, scream"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is a child speaking?", "label": 0}, {"captions": ["distant men speak as a spray can nozzle is depressed", "people speak as gunfire rings out"], "sample_ids": ["rwtmaKiCcQU", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["nozzle, depressed, spray can", "gunfire, ring, speak"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["spraying and people speaking", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a telephone rings followed by a woman talking"], "sample_ids": ["t25U-v4k4ts", "tGcFnX0GHI"], "start_seconds": ["40", "0"], "properties": ["a, chirps, bird", "ring, talk, woman"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a human talking?", "label": 1}, {"captions": ["a beep repeats multiple times", "wind blowing followed by a zoom"], "sample_ids": ["y682ml90jGw", "vr8ZXjEBhMQ"], "start_seconds": ["11", "150"], "properties": ["beep, repeat, multiple", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a 
motorcycle's point of view"], "captions_pred_audio": ["a beeping sound is being made ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "paper is crumpling consistently"], "sample_ids": ["x6ijhqRY38s", "v5cSxLaHADY"], "start_seconds": ["250", "0"], "properties": ["bowl, silverware, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a person is snoring while sleeping"], "sample_ids": ["zuua6-5goWw", "vJrjSeP17yE"], "start_seconds": ["30", "40"], "properties": ["birds, chirp, quiet, man, speaks", "a person is sleeping, snoring, person"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a person snoring loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["small dogs yip and bark sharply", "a telephone rings followed by a woman 
talking"], "sample_ids": ["v-wcQf4BDY0", "tGcFnX0GHI"], "start_seconds": ["120", "0"], "properties": ["bark, yip, sharply", "ring, talk, woman"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", null], "captions_pred_audio": ["a dog barks and growls", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a man speaks as a car is passing by"], "sample_ids": ["wSVhSdj0F0", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["horn honks, keys jingle, electronic beep", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a recording of a car passing by?", "label": 1}, {"captions": ["food is frying while a woman speaks", "dogs barking and whimpering"], "sample_ids": ["yhQ2Lg-7qDY", "tIY7qOV3rEM"], "start_seconds": ["130", "0"], "properties": ["food, woman, speak", "barking, whimpering, dog"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a dog is barking and a cat is meowing"], "question": "which entity is a dog", "label": 1}, {"captions": ["a car speeding up in the distance", "an engine sputters followed by a car zooming 
by"], "sample_ids": ["u0TrcHhkPQ", "u5RmF3c3Aw"], "start_seconds": ["20", "60"], "properties": ["distance, car, speed", "engine, car, zoom"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a race car accelerates and skids with wind noise in the background "], "question": "which car is zooming by", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "waves crash against a shoreline and people speak"], "sample_ids": ["wqN6IIHw3po", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["rain, surface, fall", "wave, crash, shoreline"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be seen in a movie", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a car accelerates and wind blows"], "sample_ids": ["yFB25fqfU8I", "u0TrcHhkPQ"], "start_seconds": ["300", "20"], "properties": ["wave, crash, shoreline", "accelerates, wind, blows"], "captions_pred_video": ["footage of a person surfing in the ocean", null], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["heavy rain splashes as it falls", "an engine sputters followed by a car zooming by"], "sample_ids": ["wP8ZKrlx3oA", "u5RmF3c3Aw"], "start_seconds": ["40", "60"], "properties": ["fall, rain, splash", "engine, car, zoom"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a race car accelerates and skids with 
wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a infant makes noise and is excited"], "sample_ids": ["uZesmtKZGSw", "wIJK3-5y0kA"], "start_seconds": ["250", "30"], "properties": ["men, talk, cars", "noise, excited, infant"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "people applaud and hoot and chat quietly"], "sample_ids": ["vZAw4apG0Es", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["background, clock, ticktocks", "people, applaud, hoot"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["several insects fly while two men talk", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["s-T9OVOiMLo", "tDVADusiIoc"], "start_seconds": ["330", "60"], "properties": ["several, fly, men", "water, radio, man"], "captions_pred_video": ["a man climbing up a tree 
using a rope to reach the top of the tree", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a male speaks and another male speaks"], "sample_ids": ["tDVADusiIoc", "viuTg1M-dqg"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "two males, speaking, male"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a crowd yells, reacts and applauds"], "sample_ids": ["xC8kbrKJmco", "wztCSUxOf8"], "start_seconds": ["0", "130"], "properties": ["background, goat, scream", "a crowd, yells, applauds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["w5W5Kqtc8E", "ziUT9IFTkjg"], "start_seconds": ["100", "10"], "properties": ["wind, engine, scream", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a loud snarling 
engine is followed by a man laughing"], "sample_ids": ["tGcFnX0GHI", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["ring, talk, woman", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["zj2R0XoFr5k", "s7knHCFW82w"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, overhead", "blow horn, get close, train"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a train is blowing its horn and its wheels are squealing "], "question": "which is a train", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "water drips and bubbles as a man speaks"], "sample_ids": ["yVumC9TGknc", "vSeGhaZt-aI"], "start_seconds": ["30", "50"], "properties": ["humming, clock, birds", "water, bubbles, speak"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["an engine starts and increases in power", "a man talks followed by a woman shouting"], "sample_ids": ["zjTG0gaGCUI", "s3cTDAj31g"], "start_seconds": ["80", "80"], "properties": ["power, increase, engine", "man, talk, woman"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "people speak as gunfire rings out"], "sample_ids": ["sapQIQUhFc", "wqTCwqVRDlk"], "start_seconds": ["280", "80"], "properties": ["liquid, flow, distance", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a duck quacks continuously", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vh30P49Po6s", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["quacks, continuously, duck", "airplane, boy, fly"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a duck is quacking loudly", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "pigeons vocalize and birds chirp"], "sample_ids": ["xyL9F5VrjkE", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["wind, blows, vehicle", "vocalize, bird, chirp"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of the pigeon in the cage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a toilet door squeaks as it is opened", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["sdXV-ylviw", 
"ukg5L09Wpvo"], "start_seconds": ["190", "150"], "properties": ["door, toilet, squeaks", "a train, a horn, a bell"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "wind blowing followed by a zoom"], "sample_ids": ["zcDwZ6W7E3E", "vr8ZXjEBhMQ"], "start_seconds": ["180", "150"], "properties": ["man, speak, motorcycles", "wind, blow, zoom"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a video of a man speaking?", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["vbpKkWvfOu4", "sQGXqGcwOTc"], "start_seconds": ["560", "3"], "properties": ["a, man, speaks", "cling, speak, dishes"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "mechanisms are operating and water is splashing "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["distant humming of an engine", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yVPZ2MNWpms", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["sound, distance, engine", "a, 
scream, girl"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car is driving by on the road ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["x6ijhqRY38s", "wqZ135Ssz0"], "start_seconds": ["250", "60"], "properties": ["bowl, silverware, man", "two men, woman, birds"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vhJWZheqaE", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["water drains unevenly, toilet flushes, water drains", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a person talking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a toilet flushes and a female speaks"], "sample_ids": ["tgbONvsP47Y", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["pass, vehicle, roadway", "female, flushes, toilet"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving on the road ", "a toilet flushes and a man speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["a door slams shut roughly", "small dogs yip 
and bark sharply"], "sample_ids": ["zkKdxzNC97Y", "v-wcQf4BDY0"], "start_seconds": ["27", "120"], "properties": ["a door, slams, shut", "bark, yip, sharply"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a door is opened and closed", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a drill runs and two people laugh", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["tEE3MpBt1sg", "uWPRNLnpy7Y"], "start_seconds": ["50", "10"], "properties": ["two people, laugh, drill", "accelerate, laugh, vehicle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "is taken from a car driving down the street"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xMXvkIcaG0Y", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["sound, humming, rattling", "loud, multiple, distance"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", null], "captions_pred_audio": ["an engine is revving and accelerating ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "multiple birds vocalize and wind blows"], "sample_ids": ["wvKpEYswXO0", "uoGVs9yUqY4"], "start_seconds": ["150", "30"], "properties": ["plastic, tap, speak", "multiple, vocalize, wind"], "captions_pred_video": ["of the person preparing food in the kitchen", "for how to make a wooden shed door youtube"], "captions_pred_audio": ["a woman is speaking and tapping with 
background noise and water running ", "birds are chirping and flapping their wings with wind noise in the background "], "question": "which entity is not a person", "label": 1}, {"captions": ["leaves rustle while man speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["zOZleIRqZm4", "vzxHnu-SFEw"], "start_seconds": ["80", "80"], "properties": ["leaves, rustle, speak", "two objects, woman, speak"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a motorcycle engine is revving while people are speaking", "a woman speaks and then a man speaks"], "sample_ids": ["y8dSeubCNI", "vbpKkWvfOu4"], "start_seconds": ["4", "560"], "properties": ["engine revving, people speaking, motorcycle", "a, man, speaks"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["an engine revving and people talking in the background", "a woman is speaking and a man is speaking"], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a muffled toilet flushes and the water drains"], "sample_ids": ["w1mlz3Pe4fU", "sfAvvZwdLCY"], "start_seconds": ["300", "20"], "properties": ["vocalize, chirp, continuously", "flushes, drains, water"], "captions_pred_video": ["of a bird in a cage", "footage of the toilet in the bathroom"], "captions_pred_audio": ["birds are chirping and singing", "a toilet is flushed"], "question": "which entity is silent", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "an audience gives applause"], "sample_ids": ["vhJWZheqaE", "x6iCUDmRpKQ"], "start_seconds": ["0", "38"], "properties": ["water drains unevenly, toilet flushes, water drains", "applause, audience, give"], "captions_pred_video": [null, "a black background with the moon and stars in the sky"], "captions_pred_audio": ["a toilet is flushed", "a group of people are clapping and cheering"], "question": "which entity is a response to a stimulus", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "pigeons vocalize and birds chirp"], "sample_ids": ["vZAw4apG0Es", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["people, clock, converse", "vocalize, bird, chirp"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of the pigeon in the cage"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["water flows and trickles", "a bus engine idles while a woman speaks making an 
announcement"], "sample_ids": ["tB7hWb9gTuQ", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["water, flow, trickle", "engine, idle, woman"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["water is splashing and gurgling", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["uOpoD0gGXcs", "vlS6YMeWAPo"], "start_seconds": ["120", "40"], "properties": ["chirps, woman, bird", "sheep, baa, birds"], "captions_pred_video": ["a herd of cows grazing in the field", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a goat bleats and birds chirp"], "question": "which entity is a response to a human chirping?", "label": 0}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "someone snores nearby"], "sample_ids": ["su6FAOcOA8c", "spJCm8tD9Zo"], "start_seconds": ["4", "90"], "properties": ["engine, idle, woman", "someone snores, nearby, someone"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["people speak then an engine runs", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["uMTTDZ2mb4", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["engine, run, people", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wAAkbZToh8", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["burp, laugh, speak", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man burps and a woman speaks", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a snore", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["vlJS7LN2XyM", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["background, clocks, ticking", "motor noise, horn, siren"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a ticktock of a clock", "a truck is honking its horn and a siren is blaring "], "question": "which entity is more ominous", "label": 1}, {"captions": ["leaves rustle while man speaks", "music plays and 
someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zOZleIRqZm4", "xKB8O8LTs6s"], "start_seconds": ["80", "70"], "properties": ["leaves, rustle, speak", "music, gunfire, explosion"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["a machine runs continuously", "wind blows as people chatter quietly"], "sample_ids": ["wdXV3Pv0jiY", "xBxDz0CFVn0"], "start_seconds": ["11", "30"], "properties": ["machine, running, continuously", "wind, chatter, people"], "captions_pred_video": ["footage is blurry and shaky", "footage is blurry and out of focus"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sYITalLZjj4", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["water, rushes, background, birds", "People, motor, brakes"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xl2PIWyXaM", "w5W5Kqtc8E"], "start_seconds": ["160", "100"], "properties": ["chirp, man, younger person", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people are 
talking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["a person is whistling", "birds chirp quietly and an adult man speaks"], "sample_ids": ["sIXTftIuUgw", "zuua6-5goWw"], "start_seconds": ["90", "30"], "properties": ["person, whistling, person", "birds, chirp, quiet, man, speaks"], "captions_pred_video": [null, "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["a person whistling a song", "birds are chirping and a man is speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["siJFXfGWgDk", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["man, woman, vehicle", "a woman, chirps, animal"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking and a dog is barking "], "question": "which entity has a more active animal", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zdYdyF9-m8U", "wz7N8YRy74I"], "start_seconds": ["7", "30"], "properties": ["wind, crash, 
shoreline", "rooster, crow, background, men"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["waves crash and wind blows ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vD6lYD1l0BY", "tDVADusiIoc"], "start_seconds": ["330", "60"], "properties": ["a, machine, run", "water, radio, man"], "captions_pred_video": ["game controller being held in the hands of the person", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["people speak then an engine runs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uMTTDZ2mb4", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["engine, run, people", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "dishes cling together then a man begins to speak"], "sample_ids": ["wqZ135Ssz0", "sQGXqGcwOTc"], "start_seconds": ["60", "3"], "properties": ["man, woman, squawks", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "mechanisms are operating and water is splashing "], 
"question": "which entity has a man speaking", "label": 1}, {"captions": ["an insect buzzes around continuously", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["v25l1jef3JY", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "rooster, crow, background, men"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["v-wcQf4BDY0", "xKB8O8LTs6s"], "start_seconds": ["120", "70"], "properties": ["bark, yip, sharply", "music, gunfire, explosion"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog barks and growls", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "several insects fly while two men talk"], "sample_ids": ["rwtmaKiCcQU", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["nozzle, depressed, spray can", "several, fly, men"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["spraying and people speaking", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["a man speaks uses a drill", "propeller rearing loudly with some male and female voices interspersed in the 
background"], "sample_ids": ["x5eIC7S0fbg", "vqZuVbG6-HI"], "start_seconds": ["60", "130"], "properties": ["A man is speaking, uses a drill, and is a tool", "background, male, female"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a lawn mower is running and men are speaking "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tDlfY3nmx1A", "uZesmtKZGSw"], "start_seconds": ["160", "250"], "properties": ["applause, laugh, man", "men, talk, cars"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "a man speaks as horns blow"], "sample_ids": ["yswmmRZFItk", "tHyNqRyK34A"], "start_seconds": ["0", "24"], "properties": ["background, frog, croak", "a, man, speaks"], "captions_pred_video": ["a close up of a frog in the water", "being taken from inside a 
vehicle on the street at night"], "captions_pred_audio": ["a frog is croaking", "a man is speaking and a car is honking with background noise "], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a baby coos and fidgets as a lady speaks and laughs"], "sample_ids": ["voJh2gJxXhA", "uPDn2BFTHk"], "start_seconds": ["50", "140"], "properties": ["music, frog, croak", "lady, laugh, baby"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", null], "captions_pred_audio": ["music is playing and crickets are chirping ", "a baby laughs and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "an adult woman and an adult man speak"], "sample_ids": ["sofxkNWaP0s", "zTLVJCo4WEE"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "two people, adult, speak"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a woman speaks and crickets chirp"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a clock ticktocks briefly", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["u7C-AEBQM", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks briefly", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and ducks are quacking"], "question": "which entity is a clock?", "label": 0}, {"captions": ["a door slams shut and an object moves on a hard surface", "people applaud and hoot and chat quietly"], "sample_ids": ["zkKdxzNC97Y", "wwyfGO2J4"], "start_seconds": ["27", "90"], 
"properties": ["hard, surface, door", "people, applaud, hoot"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be in a theater", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sWZzXuWYY", "uYT5gxnyMWM"], "start_seconds": ["420", "50"], "properties": ["male, speech, banging", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a woman", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uiS58TNyUiw", "xBxDz0CFVn0"], "start_seconds": ["430", "30"], "properties": ["vocalize, bird, chirp", "stream, water, flow"], "captions_pred_video": ["of the pigeon in the cage", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is not a stream of water flowing?", "label": 0}, {"captions": ["a saw finishes running as metal clings in the background", "water runs into a sink while men speak"], "sample_ids": ["zofjfKhqLk8", "vzceMbklWc"], "start_seconds": ["10", "180"], "properties": ["background, metal, clings", "water, sink, run"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "water is running and a man is speaking"], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a car speeding up in the 
distance", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["u0TrcHhkPQ", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["distance, car, speed", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["an airplane engine runs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["yVPZ2MNWpms", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["engine, airplane, runs", "engine, revs, vehicle"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car is driving by on the road ", "a race car accelerates and revs its engine "], "question": "which entity has a running engine", "label": 0}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a car speeding up in the distance"], "sample_ids": ["yLy-WycbVVE", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, people, talk", "distance, car, speed"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a duck quacks continuously"], "sample_ids": ["siJFXfGWgDk", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["man, woman, vehicle", "quacks, continuously, duck"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "of a man brushing his teeth with a toothbrush in his mouth"], 
"captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "waves crash against a shoreline and people speak"], "sample_ids": ["ylpYOorfH4o", "yFB25fqfU8I"], "start_seconds": ["410", "300"], "properties": ["motor, run, steady", "wave, crash, shoreline"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is a natural phenomenon", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a young woman speaks over spraying and another person yells"], "sample_ids": ["tPJvjq9QePY", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["bleats, person, speak", "person, spray, yell"], "captions_pred_video": ["a dog and a sheep in a barn", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby cries and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "a vehicle engine revs as the vehicle passes"], 
"sample_ids": ["tezvROoo4bs", "yDoT73BWsdA"], "start_seconds": ["40", "10"], "properties": ["audio, throttle, speaking", "engine, revs, vehicle"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a race car accelerates and revs its engine "], "question": "which entity is a video of a vehicle?", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "an insect buzzes around continuously"], "sample_ids": ["w6RTHR6AeAg", "v25l1jef3JY"], "start_seconds": ["40", "0"], "properties": ["call, owl, screech", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a fly is buzzing around a microphone "], "question": "which entity is a predator", "label": 0}, {"captions": ["an aircraft engine runs as wind blows heavily", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xjvTpk2Zpr8", "vfYTJq7nU"], "start_seconds": ["70", "130"], "properties": ["engine, run, wind", "rustling, ducks, quack"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a duck quacks and a woman speaks"], "question": "which entity is not a video of an aircraft engine running?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wz7N8YRy74I", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["rooster, crow, background, men", "applause, audience, yells"], "captions_pred_video": ["footage of the sun shining through the clouds on a 
cloudy day", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a helicopter engine runs continuously"], "sample_ids": ["tOSWIURC-4", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["engine, work, nearby", "engine, running, continuously"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a lawn mower is running ", "a helicopter is flying overhead "], "question": "which entity has a running engine", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a man speaks as a motor runs in the background"], "sample_ids": ["vJvryTwuAV8", "xZepNM9qcRA"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "background, motor, run"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zFjIWfSD-4", "vfYTJq7nU"], "start_seconds": ["410", "130"], "properties": ["People, motor, brakes", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, 
{"captions": ["water pouring and bubbling", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uyRfq-jKPpo", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["water, bubbles, pouring", "a woman, something, fried"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "- a woman cooking in the kitchen"], "captions_pred_audio": ["water is running from a faucet", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["a door opens and closes", "water pouring and bubbling"], "sample_ids": ["vBHyYJ8pL0", "uyRfq-jKPpo"], "start_seconds": ["2", "50"], "properties": ["open, close, door", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "water is running from a faucet"], "question": "which entity is more likely to be a door", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "some men converse over an engine running"], "sample_ids": ["sa6TLVbooCc", "sCiy7QS1U"], "start_seconds": ["240", "300"], "properties": ["people, laugh, child", "men, converse, engine"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", null], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows people speaking and laughing as a child speaks?", "label": 0}, {"captions": ["birds chirp and pigeons vocalize while walking around", "vehicles pass by on a roadway"], "sample_ids": ["wIvYjuR3nrg", "tgbONvsP47Y"], "start_seconds": ["9", "0"], "properties": ["birds, pigeons, vocalize", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and cooing", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a clock ticktocks in wind", "a car accelerates and wind blows"], "sample_ids": ["yVumC9TGknc", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, wind", "accelerates, wind, blows"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", null], "captions_pred_audio": ["a series of beeps and chirps", "a race car accelerates and revs its engine "], "question": "which entity is 
moving", "label": 1}, {"captions": ["a toilet flushes and water drains", "a clock ticktocks"], "sample_ids": ["sfAvvZwdLCY", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a toilet is flushed", "a clock is ticking loudly"], "question": "which entity is a timepiece", "label": 1}, {"captions": ["food is frying and sizzles", "paper folding and crinkling"], "sample_ids": ["zNRChLjqcU", "zPpG3RD8lSs"], "start_seconds": ["220", "20"], "properties": ["food is frying, sizzles, food", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["water is running from a faucet into a sink", "the wind blows and a mouse clicks "], "question": "which entity is not a food", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vr8ZXjEBhMQ", "uEU-Hg5MTN8"], "start_seconds": ["150", "27"], "properties": ["wind, blow, zoom", "a woman, laughs, 
animal"], "captions_pred_video": ["is taken from a motorcycle's point of view", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vbpKkWvfOu4", "wqZ135Ssz0"], "start_seconds": ["560", "60"], "properties": ["a, man, speaks", "two men, woman, birds"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "an insect buzzes around continuously"], "sample_ids": ["vveS8HT7Uog", "v25l1jef3JY"], "start_seconds": ["100", "0"], "properties": ["a man, objects, speak", "buzzes, continuously, insect"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a drill runs and two people laugh", "water drips and bubbles as a man speaks"], "sample_ids": ["tEE3MpBt1sg", "vSeGhaZt-aI"], "start_seconds": ["50", "50"], "properties": ["two people, laugh, drill", "water, bubbles, speak"], "captions_pred_video": ["footage is blurry due to the smoke in the 
air", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xKB8O8LTs6s", "vb1fPSDI4c"], "start_seconds": ["70", "30"], "properties": ["music, gunfire, explosion", "multiple, people, yell"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "winds blows roughly as a vehicle races past"], "sample_ids": ["sjlVMgdGSK0", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["car, revving, loudly", "wind, blows, vehicle"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine roars and wind blows "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "water flows as men speak and yell"], "sample_ids": ["uWPRNLnpy7Y", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["accelerate, laugh, vehicle", "water, flow, men"], "captions_pred_video": ["is taken from a car driving down the street", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which 
entity is a video of a vehicle?", "label": 0}, {"captions": ["an insect buzzes around continuously", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["v25l1jef3JY", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "rooster, crow, background, men"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a man talks while metallic objects are rapped and steam is released"], "sample_ids": ["vr8ZXjEBhMQ", "vuUVPzd2FXw"], "start_seconds": ["150", "160"], "properties": ["wind, blow, zoom", "a, steam, release"], "captions_pred_video": ["is taken from a motorcycle's point of view", "of the person cooking on the grill with a spatula"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking and dishes are clanging"], "question": "which entity is a video of a man talking?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a man speaks, another man speaks, and a small bell dings"], "sample_ids": ["xjhAnI2q6hM", "t69a8aRKhmc"], "start_seconds": ["6", "30"], "properties": ["engine revs, vehicle, people", "a, b, c"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage is blurry and out of focus"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a vehicle?", "label": 0}, {"captions": ["a person speaks over rustling leaves", "water pouring and bubbling"], "sample_ids": ["zOZleIRqZm4", "uyRfq-jKPpo"], "start_seconds": ["80", "50"], "properties": 
["rustling, leaves, person", "water, bubbles, pouring"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a infant makes noise and is excited"], "sample_ids": ["xfaoyyzw2WU", "wIJK3-5y0kA"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "noise, excited, infant"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a baby cries and a woman speaks"], "question": "which is louder", "label": 1}, {"captions": ["a man is filing a hard object", "a man speaks over intermittent keyboard taps"], "sample_ids": ["vveS8HT7Uog", "tw76HGONaKg"], "start_seconds": ["100", "570"], "properties": ["a man, hard, object", "audio, man, keyboard"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "game you are currently playing on microsoft xbox 360 the legend of 
zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man speaks and types on a computer keyboard "], "question": "which entity is a video of a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a man speaks followed by another man speaking outside"], "sample_ids": ["wz7N8YRy74I", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, people", "two men, speak, follow"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a rooster in it?", "label": 0}, {"captions": ["a jet engine screams, then increases its power", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vBslzh7saPw", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["power, scream, increase", "female, spraying, scream"], "captions_pred_video": ["a pickup truck 
carrying a large object down the road", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "water pouring and bubbling"], "sample_ids": ["vh30P49Po6s", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["loud, continuous, quacks", "water, bubbles, pouring"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a duck is quacking loudly", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple ducks quack continuously", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wfHeoPDLMaM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["multiple, quack, continuously", "loud, multiple, distance"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle 
of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "an infant crying frantically"], "sample_ids": ["vbpKkWvfOu4", "zwOBqeFTgiU"], "start_seconds": ["560", "30"], "properties": ["a, woman, man", "cry, infant, frantically"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["water rushes by", "a man is snoring loudly and repeatedly"], "sample_ids": ["x-PeY8Yb8M4", "sncRqQ67iJU"], "start_seconds": ["300", "460"], "properties": ["water, rushes, by", "loud, repeatedly, man"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a car is driving on a wet road ", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "loud intermittent 
buzzing with intermittent laughter"], "sample_ids": ["su6FAOcOA8c", "sLUnaPT5gM8"], "start_seconds": ["4", "0"], "properties": ["engine, idle, woman", "loud, laughter, intermittent"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a horn rings out as a machine runs by", "a car accelerates and wind blows"], "sample_ids": ["slZLHwNbbt4", "u0TrcHhkPQ"], "start_seconds": ["300", "20"], "properties": ["a, horn, run", "accelerates, wind, blows"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a race car accelerates and revs its engine "], "question": "which is not a machine", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y8WEcpOlT3I", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["harsh, wind, blows", "men, talk, cars"], "captions_pred_video": ["on how to use a sewing machine youtube", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with wind noise in the 
background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a helicopter engine runs continuously", "vehicles pass by on a roadway"], "sample_ids": ["ugHJF0hfYkg", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an insect buzzes around continuously", "a horn blasts as warning bells ring"], "sample_ids": ["v25l1jef3JY", "zgUgkpk78xU"], "start_seconds": ["0", "70"], "properties": ["buzzes, continuously, insect", "horn, bells, ring"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a train blows its horn as it speeds down the tracks "], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xV7Mg1QucSc", "yDoT73BWsdA"], "start_seconds": ["14", "10"], "properties": ["alarm, ticktocks, laughs", "engine, revs, vehicle"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a race car accelerates and revs its engine "], "question": "which 
entity is a vehicle?", "label": 1}, {"captions": ["a door opens and birds chirp", "birds vocalize and chirp continuously"], "sample_ids": ["yeFvk9x0wWI", "w1mlz3Pe4fU"], "start_seconds": ["30", "300"], "properties": ["door, open, birds", "vocalize, chirp, continuously"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of a bird in a cage"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "birds are chirping and singing"], "question": "which entity is more active", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a jet engine spools up and takes off"], "sample_ids": ["wRV8yMk886E", "vBslzh7saPw"], "start_seconds": ["0", "90"], "properties": ["liquid, spray, nozzle", "engine, spools, takes"], "captions_pred_video": ["two cars are parked in a parking lot at night", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a jet engine roars and accelerates "], "question": "which entity is a moving object", "label": 1}, {"captions": ["continuous snoring", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sLkeqCDJIyw", "vYkA3cfXp5Q"], "start_seconds": ["120", "30"], "properties": ["loud, snoring, noise", "engine, accelerate, idle"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a person is snoring loudly", "an engine is idling"], "question": "which entity is not a noise", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a infant makes noise and is excited"], "sample_ids": ["v0x1odnXtP0", "wIJK3-5y0kA"], "start_seconds": ["210", "30"], "properties": ["keyboard, type, computer", "noise, excited, infant"], "captions_pred_video": ["how to make money on youtube in spanish", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a person is typing on a keyboard", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a car accelerates and wind blows"], "sample_ids": ["vJ7JPEFhyLA", "u0TrcHhkPQ"], "start_seconds": ["16", "20"], "properties": ["three men, wind, flow", "accelerates, wind, blows"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["someone whistles a song", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sIXTftIuUgw", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["someone, song, whistle", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["yZmhM1HcsyE", "tdWhHV3X25Q"], "start_seconds": ["4", "60"], "properties": ["engine, roar, water", "applause, audience, 
yells"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a clock ticktocks"], "sample_ids": ["tgbONvsP47Y", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["pass, vehicle, roadway", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a fire truck entering a garage", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a car is driving on the road ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vr8ZXjEBhMQ", "sSMl2vc3ek"], "start_seconds": ["150", "20"], "properties": ["wind, blow, zoom", "loud, multiple, distance"], "captions_pred_video": ["is taken from a motorcycle's point of view", null], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a person snoring loudly"], "question": "which entity is not a zoom", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "someone whistles a tune"], "sample_ids": ["w0xsN8X18Y", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["music, surface, rain", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaking with light rustling", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zOZleIRqZm4", "uYT5gxnyMWM"], "start_seconds": 
["80", "50"], "properties": ["light, rustling, man", "a, scream, girl"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity is more spooky", "label": 1}, {"captions": ["a infant makes noise and is excited", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wIJK3-5y0kA", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["noise, excited, infant", "water, radio, man"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["sWZzXuWYY", "zY3icUyMdh8"], "start_seconds": ["420", "20"], "properties": ["male, speech, banging", "dog, bark, engine"], "captions_pred_video": [null, "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a car is driving and dogs are barking and squealing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xyL9F5VrjkE", "ukg5L09Wpvo"], "start_seconds": ["20", "150"], "properties": ["wind, motor, distance", "clickety-clack, train, whistle"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": 
["the wind is blowing and a car is passing by ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a person is whistling a tune", "some tunes played by whistling"], "sample_ids": ["scYRUkrFLiQ", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["a, tune, whistle", "tune, play, whistling"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person whistling a song", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "three men talk while wind blows and some liquid flows"], "sample_ids": ["ziUT9IFTkjg", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["background, birds, rustling", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a natural phenomenon", "label": 0}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "an infant crying as a woman laughs"], "sample_ids": ["yYEVLuqEytU", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["animal, pig, background", "a, laugh, infant"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["spYNpeN7rPY", "yDoT73BWsdA"], 
"start_seconds": ["1", "10"], "properties": ["a clock, ticktock, man", "engine, revs, vehicle"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an audience gives applause", "a man speaks uses a drill"], "sample_ids": ["x6iCUDmRpKQ", "x5eIC7S0fbg"], "start_seconds": ["38", "60"], "properties": ["applause, audience, give", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a group of people are clapping and cheering", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["water 
runs from a faucet while some men speak and the water runs in the sink", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["vzceMbklWc", "s7knHCFW82w"], "start_seconds": ["180", "30"], "properties": ["water, faucet, sink", "blow horn, get close, train"], "captions_pred_video": [null, "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["water is running and a man is speaking", "a train is blowing its horn and its wheels are squealing "], "question": "which is a train", "label": 1}, {"captions": ["an airplane engine runs", "a person uses a saw to cut some wood"], "sample_ids": ["yVPZ2MNWpms", "sHbXC6na9hg"], "start_seconds": ["0", "0"], "properties": ["engine, airplane, runs", "a person, saw, wood"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a car is driving by on the road ", "an engine is idling and vibrating"], "question": "which entity is a person", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "someone is typing on a computer keyboard"], "sample_ids": ["vXlk0lIQBFo", "v0x1odnXtP0"], "start_seconds": ["470", "210"], "properties": ["wind, talk, vocalize", "keyboard, type, computer"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "how to make money on youtube in spanish"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a clock ticktocks", "a infant makes noise and is excited"], "sample_ids": ["v-g-j2uTByM", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks", "noise, excited, infant"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of a baby playing 
with a cat in a dark room"], "captions_pred_audio": ["a clock is ticking loudly", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["women speak and laugh as wind blows", "an airplane engine spools and people speak"], "sample_ids": ["un9VQlzgZM", "wTjoRj1se3U"], "start_seconds": ["5", "390"], "properties": ["wind, speak, laugh", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a jet engine is running and people are talking"], "question": "which entity is about a moving object", "label": 1}, {"captions": ["material crumbles into a microphone", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vofpvUo6NAw", "tdWhHV3X25Q"], "start_seconds": ["220", "60"], "properties": ["material, crumbles, microphone", "applause, audience, yells"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a man is speaking and a crowd is clapping"], "question": "which is a live action", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zgUgkpk78xU", "sLUnaPT5gM8"], "start_seconds": ["70", "0"], "properties": ["horn, bell, train", "loud, laughter, intermittent"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of a baby laying on his stomach in a blue shirt and diaper"], 
"captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a dog barks and whimpers", "a train horn blows as it passes by"], "sample_ids": ["sShpyu2l4YQ", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "horn, blows, train"], "captions_pred_video": ["the puppies are playing with a toy", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a dog is barking and growling", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["scraping and female speech with distant music", "a man speaks as a motor runs in the background"], "sample_ids": ["yHeVV-xeOxQ", "xZepNM9qcRA"], "start_seconds": ["130", "30"], "properties": ["female, speech, music", "background, motor, run"], "captions_pred_video": ["of a girl milking a goat's udder", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in 
the background?", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xyL9F5VrjkE", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["engine, run, wind", "a woman, something, fried"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "- a woman cooking in the kitchen"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "small dogs yip and bark sharply"], "sample_ids": ["u6jIvCtKarQ", "v-wcQf4BDY0"], "start_seconds": ["70", "120"], "properties": ["a, man, speaks", "bark, yip, sharply"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "an infant crying as a woman laughs"], "sample_ids": ["yNtRmrn0io8", "xhmRY9yhC7c"], "start_seconds": ["210", "20"], "properties": ["storm, distance, strike", "a, laugh, infant"], "captions_pred_video": ["footage of a house in the middle of the night", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["rain falls and thunder roars", "a baby cries and a woman speaks"], "question": "which is not a person", "label": 0}, {"captions": ["a woman speaks and food sizzles while frying", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wTideSjRFS0", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["food, sizzle, woman", "airplane, boy, fly"], "captions_pred_video": ["footage 
of a woman cooking in a kitchen with a microwave oven", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about flying", "label": 1}, {"captions": ["a woman talking as an infant is crying", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tMbMDvT50j8", "xKB8O8LTs6s"], "start_seconds": ["12", "70"], "properties": ["a, talk, infant", "music, gunfire, explosion"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby cries and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["water splashes as an animal walks through", "a duck quacks continuously"], "sample_ids": ["w1ir-sZ3Im8", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["animal, water, splashes", "quacks, continuously, duck"], "captions_pred_video": ["footage of a group of people riding horses through a river", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a duck is quacking loudly"], "question": "which animal is more active", "label": 0}, {"captions": ["a rumble grows louder", "people speak as gunfire rings out"], "sample_ids": ["y4MY9mp8-TA", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["loudness, increase, rumble", "gunfire, ring, speak"], "captions_pred_video": ["a helicopter flying in the sky", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a helicopter flies overhead ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 
1}, {"captions": ["a motorcycle idles loudly as wind blows", "plastic is tapped on while someone speaks"], "sample_ids": ["v7jJS8aAyA", "wvKpEYswXO0"], "start_seconds": ["10", "150"], "properties": ["wind, blows, loudly", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is quieter", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a duck quacks continuously"], "sample_ids": ["zhx6hoYrHeI", "vh30P49Po6s"], "start_seconds": ["160", "30"], "properties": ["engine, sputter, rough", "quacks, continuously, duck"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "water is sprayed across a hard surface"], "sample_ids": ["sOa7g-44Dag", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["background, man, spray", "water, spray, surface"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "spraying followed by silence"], "question": "which entity is sprayed across a hard surface", "label": 1}, {"captions": ["a duck quacks continuously", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["vh30P49Po6s", "vXlk0lIQBFo"], "start_seconds": ["30", "470"], "properties": ["quacks, continuously, duck", "wind, speak, vocalize"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", 
"- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a duck is quacking loudly", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is speaking", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yRx9txMcBl0", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["motors, tires, screech", "a woman, something, fried"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "birds chirp as a man speaks and a younger person speaks"], "sample_ids": ["uC9dtII1KDI", "xl2PIWyXaM"], "start_seconds": ["150", "160"], "properties": ["wind, gusts, distance", "chirp, man, younger person"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", null], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "birds are chirping and people are talking"], "question": "which entity is more likely to be in a city", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a man speaks over a radio as wind 
blows and water splashes"], "sample_ids": ["zl9Dqx-j7q4", "tDVADusiIoc"], "start_seconds": ["6", "60"], "properties": ["engine, laugh, loud", "water, radio, man"], "captions_pred_video": ["footage of a man driving a car in the dark", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a speedboat passes quickly on the water"], "sample_ids": ["w5W5Kqtc8E", "tjmoSi330GM"], "start_seconds": ["100", "23"], "properties": ["water, splashes, motorboat", "speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a motorboat speeds through water with wind noise "], "question": "which boat is moving faster", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "wind blows as people chatter quietly"], "sample_ids": ["yDoT73BWsdA", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["engine revs, tires squeal, vehicle", "wind, chatter, people"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage is blurry and out of focus"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["uoGVs9yUqY4", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["multiple, vocalize, wind", "sheep, baa, birds"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage of a goat in a pen behind a wooden fence"], 
"captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["a infant makes noise and is excited", "a telephone rings followed by a woman talking"], "sample_ids": ["wIJK3-5y0kA", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["noise, excited, infant", "ring, talk, woman"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a infant makes noise and is excited", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wIJK3-5y0kA", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["noise, excited, infant", "People, motor, brakes"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a machine beeps continuously"], "sample_ids": ["wTideSjRFS0", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["food, sizzle, woman", "beeps, machine, continuously"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "running water in a faucet with some clinks"], "sample_ids": ["vh30P49Po6s", "zNRChLjqcU"], "start_seconds": ["30", "220"], "properties": ["loud, continuous, quacks", "water, faucet, run"], "captions_pred_video": ["of a man brushing his teeth 
with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "water is running from a faucet into a sink"], "question": "which entity is quieter", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vJvryTwuAV8", "vb1fPSDI4c"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "multiple, people, yell"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", null], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking?", "label": 1}, {"captions": ["people speak in a closed space", "a car accelerates and wind blows"], "sample_ids": ["sTpirNYo8vQ", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["people, space, speak", "accelerates, wind, blows"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person sniffles and sneezes", "several insects fly while two men talk"], "sample_ids": ["uRlbY6aoBU", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["sneezes, sniffles, person", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a person?", "label": 0}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a man speaks as a motor runs in the background"], "sample_ids": ["vdoxuJn9lTc", "xZepNM9qcRA"], "start_seconds": ["40", "30"], 
"properties": ["burp, loud, girl", "background, motor, run"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a child speaks followed by a burp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "small dogs yip and bark sharply"], "sample_ids": ["ugHJF0hfYkg", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["engine, running, continuously", "bark, yip, sharply"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a helicopter is flying overhead ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a duck quacks continuously"], "sample_ids": ["wwyfGO2J4", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["people, applaud, hoot", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["continuous snoring", "a woman speaks as she rubs two objects together"], "sample_ids": ["sLkeqCDJIyw", "vzxHnu-SFEw"], "start_seconds": ["120", "80"], "properties": ["loud, snoring, noise", "two objects, woman, speak"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is silent", "label": 1}, {"captions": ["someone is burping continuously", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y636gklDioE", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["burps, burps, burps", "men, talk, cars"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person burps 
loudly several times", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["food fries in a pan as someone talks and cooks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["ukxt9I7eMMg", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["food, pan, cook", "rooster, crow, background, men"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a rooster?", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "water splashes as an animal walks through"], "sample_ids": ["ylpYOorfH4o", "w1ir-sZ3Im8"], "start_seconds": ["410", "90"], "properties": ["engine, run, loud", "animal, water, splashes"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and an engine is revving", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple beeps are followed by a 
squawk and a child speaking", "water splashes as an animal walks through"], "sample_ids": ["w34HjHr6gAY", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["beeps, squawk, child speaking", "animal, water, splashes"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["someone whistles briefly", "an insect buzzes around continuously"], "sample_ids": ["uFoga8sHpiw", "v25l1jef3JY"], "start_seconds": ["90", "0"], "properties": ["sound, duration, pitch", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a bird in a cage", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person whistles a song", "a fly is buzzing around a microphone "], "question": "which entity buzzes continuously", "label": 1}, {"captions": ["a infant makes noise and is excited", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wIJK3-5y0kA", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["noise, excited, infant", "three men, wind, flow"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is 
blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a stream of water runs briefly"], "sample_ids": ["w6RTHR6AeAg", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["call, owl, screech", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a horn rings out as a machine runs by"], "sample_ids": ["sZPuqDgX2V0", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["engine, accelerate, intercom", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sOa7g-44Dag", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["audio, scratching, man", "water, radio, man"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a recording of a man speaking?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "water flows as men speak and yell"], "sample_ids": ["yRx9txMcBl0", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["motors, tires, 
screech", "water, flow, men"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a car accelerates and wind blows"], "sample_ids": ["sfAvvZwdLCY", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["water drains, flushes, water", "accelerates, wind, blows"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["someone snores nearby", "a woman speaks as she rubs two objects together"], "sample_ids": ["spJCm8tD9Zo", "vzxHnu-SFEw"], "start_seconds": ["90", "80"], "properties": ["someone snores, nearby, someone", "two objects, woman, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks while vehicles pass by", "people speaking indiscriminately in the distance with a person snoring loudly nearby"], "sample_ids": ["sK4u5T8hW78", "w2JXXIAdUdg"], "start_seconds": ["30", "10"], "properties": ["a, man, talk", "snoring, distance, person"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a close up shot of a person's mouth with a toothbrush in it"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person snoring and a dog whimpering"], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "repeated tapping is accompanied by water running and a woman speaking softly"], "sample_ids": ["vfYTJq7nU", 
"wvKpEYswXO0"], "start_seconds": ["130", "150"], "properties": ["rustling, ducks, quack", "sound, water, running"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is accompanied by water running", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tOSWIURC-4", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["engine, work, nearby", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a lawn mower is running ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wPz6QRAkEb4", "xfaoyyzw2WU"], "start_seconds": ["60", "180"], "properties": ["chirps, tweets, song", "loud, jet engine, roar"], "captions_pred_video": ["a bird in a cage on top of a pole", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["birds are chirping in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "some tunes played by whistling"], "sample_ids": ["sOa7g-44Dag", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["audio, scratching, man", "tune, play, whistling"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a person whistling a song"], "question": "which entity is playing 
tunes", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a clock ticktocks"], "sample_ids": ["xNMovAf3o50", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["rain, thunder, music", "ticktocks, clock, ticktocks"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["wind blows strongly", "multiple people speak and children yell while water gurgles"], "sample_ids": ["w8uLijTqtlU", "vb1fPSDI4c"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "multiple, people, yell"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "paper is crumpling consistently"], "sample_ids": ["v7jJS8aAyA", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["wind, blows, loudly", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uZesmtKZGSw", "tdWhHV3X25Q"], "start_seconds": ["250", "60"], "properties": ["car, track, man", "applause, audience, yells"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["sZvwOuuPGP0", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "rooster, crow, background, men"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "several insects fly while two men talk"], "sample_ids": ["xl2PIWyXaM", "s-T9OVOiMLo"], "start_seconds": ["160", "330"], "properties": ["chirp, man, younger person", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["birds are chirping and people are talking", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a man speaking and a younger person speaking?", "label": 0}, {"captions": ["a train engine runs and a horn blows", "motor noise is followed by a horn honking and a 
siren wailing"], "sample_ids": ["zPX9o1uDiI", "y2bVZ7rz-5M"], "start_seconds": ["40", "280"], "properties": ["engine, horn, run", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["children speak as a female ask them questions", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wEBlkGWVWwE", "zj2R0XoFr5k"], "start_seconds": ["260", "50"], "properties": ["female, speak, questions", "airplane, boy, fly"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a boy speaking?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["vbZ-0lGPneg", "tDlfY3nmx1A"], "start_seconds": ["30", "160"], "properties": ["a woman, a television program, a bird", "applause, laugh, man"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["a clock ticktocks briefly", "a heavy rain falls endlessly"], "sample_ids": ["u7C-AEBQM", "wP8ZKrlx3oA"], "start_seconds": ["30", "40"], "properties": ["ticktocks, clock, ticktocks briefly", "heavy, rain, 
fall"], "captions_pred_video": [null, "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a ticktock of a clock", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["a clock ticktocks in wind", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["yVumC9TGknc", "yLy-WycbVVE"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, wind", "background, people, talk"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a clock ticktocks"], "sample_ids": ["xSKJGCItUWE", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the helicopter flying in the room", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vXlk0lIQBFo", "uEU-Hg5MTN8"], "start_seconds": ["470", "27"], "properties": ["wind, talk, vocalize", "a woman, laughs, animal"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a 
video", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vbr9mHKc8WM", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["noise, loudness, engine", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an engine is idling", "a train blows its whistle and blows its horn "], "question": "which train is making noise", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["smGI3C1NZc", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["water, drain, toilet", "a, chirps, bird"], "captions_pred_video": [null, "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and bees are buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a weapon fires multiple times", "a woman speaks happily and an animal chirps"], "sample_ids": ["sMC07Ucy7kg", "uWAAAL4CIoc"], "start_seconds": ["10", "0"], "properties": ["weapon, fire, multiple", "a woman, chirps, animal"], "captions_pred_video": ["footage is from a car's point of view", null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman is speaking and a dog is barking "], "question": "which entity is more passive", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["vddP56-ogds", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["water, flow, laugh", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a machine runs continuously", "an insect buzzes around continuously"], "sample_ids": ["wdXV3Pv0jiY", "v25l1jef3JY"], "start_seconds": ["11", "0"], "properties": ["machine, running, continuously", "buzzes, continuously, insect"], "captions_pred_video": ["footage is blurry and shaky", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a woman speaks followed by another woman whimpering and speaking"], "sample_ids": ["sOa7g-44Dag", "xOZfdgAgJ9o"], "start_seconds": ["30", "40"], "properties": ["background, man, spray", "woman, whimpering, speaking"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of a woman talking to a man in a doctor's office"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a woman is speaking and a baby is crying"], "question": "which entity is a woman speaking?", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wRBHTgrbiwg", "tdWhHV3X25Q"], "start_seconds": ["50", "60"], "properties": ["bird, owl, speak", "applause, audience, yells"], "captions_pred_video": ["of a bee pollinating the 
flowers in the field", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "a man speaks as a car is passing by"], "sample_ids": ["v5P-ThUCINM", "sK4u5T8hW78"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wz7N8YRy74I", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["rooster, crow, background, men", "men, talk, cars"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a cat meows and children speak", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["x5cuQjOdM3E", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["cat, speak, children", "female, spraying, scream"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "small dogs yip and bark sharply"], "sample_ids": ["uKCSGgof8gI", "v-wcQf4BDY0"], "start_seconds": ["12", "120"], "properties": ["chirps, distance, signal", "bark, yip, sharply"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["multiple motorcycles pass by as a man speaks", "a flush is followed by gurgling water, then another flush"], "sample_ids": ["zcDwZ6W7E3E", "tqR406bGiE"], "start_seconds": ["180", "40"], "properties": ["man, speak, motorcycles", "flush, water, gurgle"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a toilet is flushed"], "question": "which entity is about water?", "label": 1}, 
{"captions": ["a dark barks and whimpers", "winds blows roughly as a vehicle races past"], "sample_ids": ["sYj4hpDUZDQ", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["barks, whimpers, dark", "wind, blows, vehicle"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a dog barks and a cat meows", "a jet engine roars and wind blows "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a person snores loudly multiple times at a close distance"], "sample_ids": ["soTOh3zYJfY", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["vehicle, skid, tires", "loud, multiple, distance"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["people speak softly as food sizzles", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yhQ2Lg-7qDY", "w5W5Kqtc8E"], "start_seconds": ["130", "100"], "properties": ["food, sizzle, speak", "wind, blow, vehicle"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man speaks followed by another man speaking outside"], "sample_ids": ["vD6lYD1l0BY", "viuTg1M-dqg"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "two men, speak, follow"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of water coming out of a hole in the ground"], 
"captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person speaks over rustling leaves", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zOZleIRqZm4", "sLUnaPT5gM8"], "start_seconds": ["80", "0"], "properties": ["rustling, leaves, person", "loud, laughter, intermittent"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a large bell chimes back and forth loudly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w2M4i1mklOA", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "stream, water, flow"], "captions_pred_video": ["footage of an antique clock", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "roadway noise occurs and a truck accelerates"], "sample_ids": ["sEprKHm8Sj8", "tgbONvsP47Y"], "start_seconds": ["90", "0"], "properties": ["noise, loud, buzzing", "noise, truck, accelerate"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is 
driving on the road "], "question": "which noise is caused by a truck", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xC8kbrKJmco", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["background, goat, scream", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a goat is bleating ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["wz7N8YRy74I", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "beeps, hit, woman"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "an airplane engine runs"], "sample_ids": ["yRx9txMcBl0", "yVPZ2MNWpms"], "start_seconds": ["40", "0"], "properties": ["accelerates, tires, squeals", "engine, airplane, runs"], "captions_pred_video": ["in 10 
words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a car is driving by on the road "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["paper is crumpling consistently", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["v5cSxLaHADY", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "airplane, boy, fly"], "captions_pred_video": ["footage of the person holding a pair of scissors", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a woman speaks and taps on a hard surface before running tap water"], "sample_ids": ["xjvTpk2Zpr8", "wvKpEYswXO0"], "start_seconds": ["70", "150"], "properties": ["engine, run, wind", "water, tap, run"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is not running", "label": 1}, 
{"captions": ["a person uses a saw to cut some wood", "a stream of water runs briefly"], "sample_ids": ["sHbXC6na9hg", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["a person, saw, wood", "stream, water, run"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["an engine is idling and vibrating", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["white noise and birds chirping", "loud, continuous burping"], "sample_ids": ["wRBHTgrbiwg", "y636gklDioE"], "start_seconds": ["50", "20"], "properties": ["noise, white, chirping", "loud, continuous, burping"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a person burps loudly several times"], "question": "which noise is louder", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "winds blows roughly as a vehicle races past"], "sample_ids": ["wRBHTgrbiwg", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["bird, owl, speak", "wind, blows, vehicle"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a jet engine roars and wind blows "], "question": "which entity is more active", "label": 1}, {"captions": ["people speak softly as food sizzles", "a woman speaks as she rubs two objects together"], "sample_ids": ["yhQ2Lg-7qDY", "vzxHnu-SFEw"], "start_seconds": ["130", "80"], "properties": ["food, sizzle, speak", "two objects, woman, speak"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "how to make a paper cup with scissors youtube how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a woman speaking?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "some men converse over an engine running"], "sample_ids": ["vXlk0lIQBFo", "sCiy7QS1U"], "start_seconds": ["470", "300"], "properties": ["wind, speak, vocalize", "men, converse, engine"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a conversation?", "label": 1}, {"captions": ["birds chirp as a train approaches", "a frog croaks as other frogs croak in the background"], "sample_ids": ["xM4joTqDVp4", "yswmmRZFItk"], "start_seconds": ["160", "0"], "properties": ["bird, chirp, train", "background, frog, croak"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a close up of a frog in the water"], 
"captions_pred_audio": ["birds are chirping and a train is moving ", "a frog is croaking"], "question": "which entity is a solitary animal", "label": 1}, {"captions": ["people speak and tapping occurs", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["tFCUUGdREgA", "t25U-v4k4ts"], "start_seconds": ["70", "40"], "properties": ["people, tap, speak", "a, chirps, bird"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a man is speaking and bees are buzzing"], "question": "which entity has a bird chirp?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "pigeons vocalize and birds chirp"], "sample_ids": ["xERFUeZONz8", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["ring, approach, traffic", "vocalize, bird, chirp"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "of the pigeon in the cage"], "captions_pred_audio": ["an emergency vehicle siren blares", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["uOpoD0gGXcs", "t25U-v4k4ts"], "start_seconds": ["120", "40"], "properties": ["chirps, woman, bird", "a, chirps, bird"], "captions_pred_video": ["a herd of cows grazing in the field", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a man is speaking and bees are buzzing"], "question": "which entity is a response to a woman chirping for the birds?", "label": 0}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "wind blowing followed by a zoom"], "sample_ids": ["wP8ZKrlx3oA", "vr8ZXjEBhMQ"], 
"start_seconds": ["40", "150"], "properties": ["rain, storm, thunder", "wind, blow, zoom"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a heavy rain is falling on a surface", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a weather event", "label": 0}, {"captions": ["birds chirp and an insect buzzes around", "waves crash against a shoreline and people speak"], "sample_ids": ["t97k0cejSQE", "yFB25fqfU8I"], "start_seconds": ["250", "300"], "properties": ["bird, chirp, insect", "wave, crash, shoreline"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and objects are moved around", "small dogs yip and bark sharply"], "sample_ids": ["yPUYU6t3rwo", "v-wcQf4BDY0"], "start_seconds": ["370", "120"], "properties": ["birds chirp, objects are moved around, birds", "bark, yip, sharply"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["insects buzz and a man speaks", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "plastic is tapped on while someone speaks"], "sample_ids": ["zcDwZ6W7E3E", "wvKpEYswXO0"], "start_seconds": ["180", "150"], "properties": ["man, speak, motorcycles", "plastic, tap, speak"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is 
speaking while a car accelerates and revs its engine ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a man speaks in the background while a slow tick repeats", "a stream of water runs briefly"], "sample_ids": ["vZAw4apG0Es", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["background, tick, repeat", "stream, water, run"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a clock is ticking and people are talking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a woman speaks happily and an animal chirps"], "sample_ids": ["wRV8yMk886E", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["liquid, spray, nozzle", "a woman, chirps, animal"], "captions_pred_video": ["two cars are parked in a parking lot at night", null], "captions_pred_audio": ["a man speaks followed by a loud burst", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a mechanical buzzing getting louder"], "sample_ids": ["siJFXfGWgDk", "sEprKHm8Sj8"], "start_seconds": ["50", "90"], "properties": ["a, bird, vehicle", "noise, loud, buzzing"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a race car accelerates and revs its engine "], 
"question": "which entity is a noise", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a machine beeps continuously"], "sample_ids": ["vBslzh7saPw", "y682ml90jGw"], "start_seconds": ["90", "11"], "properties": ["power, scream, increase", "beeps, machine, continuously"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["zVacuqSb4LI", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["blares, fades, train", "motor noise, horn, siren"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": 
["sapQIQUhFc", "uEU-Hg5MTN8"], "start_seconds": ["280", "27"], "properties": ["water, trickles, flow", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "wind blows strongly and a young man speaks"], "sample_ids": ["uWPRNLnpy7Y", "vs65y4qmyBE"], "start_seconds": ["10", "340"], "properties": ["accelerate, laugh, vehicle", "wind, blows, strongly"], "captions_pred_video": ["is taken from a car driving down the street", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a heavy engine is running and men are speaking "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a motorcycle engine is idling", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vZAqdHZ81yA", "xBxDz0CFVn0"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "stream, water, flow"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage is blurry and out of focus"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "an infant crying as a woman laughs"], "sample_ids": ["yYEVLuqEytU", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["grunt, slurp, background", "a, laugh, infant"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a baby cries 
and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a motorcycle engine is idling"], "sample_ids": ["xV7Mg1QucSc", "vZAqdHZ81yA"], "start_seconds": ["14", "180"], "properties": ["alarm, ticktocks, laughs", "engine, motorcycle, idling"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "a motorcycle is parked on the side of the road with its rear end facing the viewer"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "an engine is idling loudly"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a stream of water flows as people talk and wind blows"], "sample_ids": ["y4tPJXBKDig", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["a, noise, talk", "stream, water, flow"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "a boat travels through the waves as the wind blows loudly and a man speaks over a radio"], "sample_ids": ["vbpKkWvfOu4", "tDVADusiIoc"], "start_seconds": ["560", "60"], "properties": ["a, woman, man", "wind, radio, waves"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], 
"question": "which entity has a man speaking over a radio?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a siren comes to life as a horn blares"], "sample_ids": ["wSVhSdj0F0", "u--KhUW8l1Y"], "start_seconds": ["10", "0"], "properties": ["horn honks, keys jingle, electronic beep", "horn, siren, life"], "captions_pred_video": [null, "a firefighter spraying water from a fire hydrant at night"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a fire truck siren blares and a horn blows "], "question": "which entity is a siren?", "label": 1}, {"captions": ["a woman talking as an infant is crying", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["tMbMDvT50j8", "s7knHCFW82w"], "start_seconds": ["12", "30"], "properties": ["a, talk, infant", "blow horn, get close, train"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a baby cries and a woman speaks", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a vehicle accelerates squealing tires"], "sample_ids": ["w5W5Kqtc8E", "sd7xVssqlw"], "start_seconds": ["100", "50"], "properties": ["water, splashes, motorboat", "accelerates, tires, squealing"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a man speaks as a motor runs in the background"], "sample_ids": ["yI-KvObbDoY", "xZepNM9qcRA"], "start_seconds": ["260", "30"], 
"properties": ["sound, smack, wind", "background, motor, run"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["someone snores nearby", "small dogs yip and bark sharply"], "sample_ids": ["spJCm8tD9Zo", "v-wcQf4BDY0"], "start_seconds": ["90", "120"], "properties": ["someone snores, nearby, someone", "bark, yip, sharply"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a person is snoring loudly", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a propeller rotates loudly and intensely", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["ugHJF0hfYkg", "zj2R0XoFr5k"], "start_seconds": ["10", "50"], "properties": ["loud, intense, propeller", "airplane, boy, fly"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman speaks while a helicopter flies overhead "], "question": "which is a moving object", "label": 1}, {"captions": ["a man is filing a hard object", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["vveS8HT7Uog", "xV7Mg1QucSc"], "start_seconds": ["100", "14"], "properties": ["a man, hard, object", "alarm, ticktocks, laughs"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man 
is filing and speaking with background noise and breathing ", "an alarm clock ticks and a woman laughs"], "question": "which entity is about a clock ticktocking and a man laughing?", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["zY3icUyMdh8", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["dog, bark, engine", "background, male, female"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a lawn mower is running and men are speaking "], "question": "which entity has a male and female voice in the background?", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "three men talk while wind blows and some liquid flows"], "sample_ids": ["u5RmF3c3Aw", "vJ7JPEFhyLA"], "start_seconds": ["60", "16"], "properties": ["engine, car, zoom", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a car zooming by?", "label": 0}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["u--KhUW8l1Y", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["engine, sound, horn", "male, duck, laugh"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", null], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", 
"label": 1}, {"captions": ["water flows followed by women screaming", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["w5W5Kqtc8E", "uEU-Hg5MTN8"], "start_seconds": ["100", "27"], "properties": ["water, flow, women", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman laughing?", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a beep repeats multiple times"], "sample_ids": ["xKB8O8LTs6s", "y682ml90jGw"], "start_seconds": ["70", "11"], "properties": ["music, gunshots, explosion", "beep, repeat, multiple"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a beeping sound is being made "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "wind blowing followed by a zoom"], "sample_ids": ["ylpYOorfH4o", "vr8ZXjEBhMQ"], "start_seconds": ["410", "150"], "properties": ["motor, run, steady", "wind, blow, zoom"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "is taken 
from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and an engine is revving", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "an engine runs loudly"], "sample_ids": ["vqZuVbG6-HI", "vqZuVbG6-HI"], "start_seconds": ["130", "130"], "properties": ["background, male, female", "loud, engine, run"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a child speaks in closed space"], "sample_ids": ["w8uLijTqtlU", "yW6FWLSLkx4"], "start_seconds": ["70", "40"], "properties": ["wind, microphone, noise", "child, space, speak"], "captions_pred_video": ["footage is blurry and shaky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sWZzXuWYY", "vb1fPSDI4c"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person is whistling a tune", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["scYRUkrFLiQ", "tiDFTC-5vU"], 
"start_seconds": ["30", "30"], "properties": ["a, tune, whistle", "male, duck, laugh"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", null], "captions_pred_audio": ["a person whistling a song", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a clicking followed by some people laughing and a kid speaking"], "sample_ids": ["wvKpEYswXO0", "vz8868znkVQ"], "start_seconds": ["150", "60"], "properties": ["sound, water, running", "audio, click, kid speaking"], "captions_pred_video": ["of the person preparing food in the kitchen", "a video of a plane flying over a cloudy sky"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a baby is laughing and breathing with background noise "], "question": "which entity has a woman speaking softly?", "label": 0}, {"captions": ["people cheer as a vehicle engine revs", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xjhAnI2q6hM", "sSMl2vc3ek"], "start_seconds": ["6", "20"], "properties": ["engine revs, vehicle, people", "loud, multiple, distance"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a person snoring loudly"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["xl2PIWyXaM", "sLUnaPT5gM8"], "start_seconds": ["160", "0"], "properties": ["chirp, man, younger person", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["birds are chirping and people are talking", "a 
baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["young female child snoring and breathing deeply", "food is frying while a woman speaks"], "sample_ids": ["sAam2NqGhLY", "yhQ2Lg-7qDY"], "start_seconds": ["20", "130"], "properties": ["snoring, breathing, child", "food, woman, speak"], "captions_pred_video": ["of a little girl sleeping on a couch", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a person is snoring", "a faucet is running and a man is speaking"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wz7N8YRy74I", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["rooster, crow, background, people", "People, motor, brakes"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["wDVMhEdTiVw", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["gun, shoot, water", "background, birds, rustling"], "captions_pred_video": ["a blurry image of trees and water in the forest", null], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a heavy rain falls endlessly", "water rushes and then a vehicle zooms past"], "sample_ids": ["wP8ZKrlx3oA", "s4Uz1Ffgo04"], "start_seconds": ["40", 
"100"], "properties": ["heavy, rain, fall", "water, rushes, vehicle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is moving faster", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a toilet flushes and water drains"], "sample_ids": ["vlJS7LN2XyM", "sfAvvZwdLCY"], "start_seconds": ["30", "20"], "properties": ["background, clocks, ticking", "water drains, flushes, water"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a ticktock of a clock", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["water flows and trickles", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["tB7hWb9gTuQ", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["water, flow, trickle", "engine, revs, vehicle"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["water is splashing and gurgling", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a woman speaks happily and an animal chirps"], "sample_ids": ["wP8ZKrlx3oA", "uWAAAL4CIoc"], "start_seconds": ["40", "0"], "properties": ["rain, storm, thunder", "a woman, chirps, animal"], "captions_pred_video": ["footage of a flooded street in the 
middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yks4cLgIDMc", "vfYTJq7nU"], "start_seconds": ["170", "130"], "properties": ["background, speaking, child", "rustling, ducks, quack"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", "a duck quacks and a woman speaks"], "question": "which entity has a child shouting in the background?", "label": 0}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a infant makes noise and is excited"], "sample_ids": ["xNMovAf3o50", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["rain, thunder, music", "noise, excited, infant"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a duck quacks loudly and continuously"], "sample_ids": ["vdoxuJn9lTc", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["person, burp, girl", "loud, continuous, quacks"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a child speaks followed by a burp", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["rain falls onto a hard surface and 
thunder roars before music plays", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["xNMovAf3o50", "vlS6YMeWAPo"], "start_seconds": ["0", "40"], "properties": ["rain, thunder, music", "sheep, baa, birds"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a goat bleats and birds chirp"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a car speeding up in the distance"], "sample_ids": ["ul60S8TXDA8", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["sound, distance, bell", "distance, car, speed"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", null], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a girl speaks followed by a scream and more girls talking", "dishes cling together then a man begins to speak"], "sample_ids": ["uYT5gxnyMWM", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["a, scream, girl", "cling, speak, dishes"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "mechanisms are operating and water is splashing "], "question": "which entity is about a girl speaking followed by a scream and more girls talking?", "label": 0}, {"captions": ["a girl speaks followed by a scream and more girls talking", "an infant crying as a woman laughs"], "sample_ids": ["uYT5gxnyMWM", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["a, scream, girl", "a, laugh, infant"], "captions_pred_video": ["footage of a person spraying paint 
on the ceiling", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "some men converse over an engine running"], "sample_ids": ["wTideSjRFS0", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["food, sizzle, woman", "men, converse, engine"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman speaking as frying food sizzles?", "label": 0}, {"captions": ["a baby cries and a woman speaks", "an infant crying frantically"], "sample_ids": ["tMbMDvT50j8", "zwOBqeFTgiU"], "start_seconds": ["12", "30"], "properties": ["a, cry, woman", "cry, infant, frantically"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "of the baby crying in the car seat"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby cries loudly"], "question": "which entity is crying frantically", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a car speeding up in the distance"], "sample_ids": ["vZAw4apG0Es", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, clock, ticktocks", "distance, car, speed"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "people cheer as a vehicle engine revs"], "sample_ids": ["rqu8iB22IY", "xjhAnI2q6hM"], "start_seconds": ["5", 
"6"], "properties": ["sound, repeats, laugh", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yswmmRZFItk", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["background, frog, croak", "men, talk, cars"], "captions_pred_video": ["a close up of a frog in the water", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a frog is croaking", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a machine beeps continuously", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["y682ml90jGw", "tdWhHV3X25Q"], "start_seconds": ["11", "60"], "properties": ["beeps, machine, continuously", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking and a crowd is clapping"], "question": "which entity is a response to a performance", "label": 1}, {"captions": ["a heavy 
rain falls endlessly", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wP8ZKrlx3oA", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["heavy, rain, fall", "a woman, laughs, animal"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "small dogs yip and bark sharply"], "sample_ids": ["ukxt9I7eMMg", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["food, pan, cook", "bark, yip, sharply"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaking with light rustling", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zOZleIRqZm4", "xfaoyyzw2WU"], "start_seconds": ["80", "180"], "properties": ["light, rustling, man", "loud, jet engine, roar"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zgUgkpk78xU", "uZesmtKZGSw"], "start_seconds": ["70", "250"], "properties": ["horn, bells, ring", "men, talk, cars"], "captions_pred_video": ["of a train 
passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a baby laugh at a sputter", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sLUnaPT5gM8", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["laugh, sputter, baby", "engine, idle, woman"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity is a person", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a stream of water runs briefly"], "sample_ids": ["ugHJF0hfYkg", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["engine, running, continuously", "stream, water, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man sitting on a rock in the 
middle of a river"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving on a wet road "], "question": "which entity is running continuously", "label": 0}, {"captions": ["a person whistles and clicks a mouse", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zCrAfDfv6-A", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["person, mouse, click", "a woman, something, fried"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["tapping occurs then a baby cries", "music plays and a woman speaks on a radio before gunshots are fired"], "sample_ids": ["wIJK3-5y0kA", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["a, cry, baby", "music, radio, gunshots"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby cries and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is about a baby crying?", "label": 0}, {"captions": ["birds chirp as a train approaches", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xM4joTqDVp4", "vJ7JPEFhyLA"], "start_seconds": ["160", "16"], "properties": ["bird, chirp, train", "three men, wind, flow"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a train?", "label": 0}, {"captions": ["a 
whistling owl calls out repeatedly and insects screech", "a man speaks as a motor runs in the background"], "sample_ids": ["w6RTHR6AeAg", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["call, owl, screech", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "birds chirp and objects are moved around"], "sample_ids": ["xZepNM9qcRA", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["background, motor, run", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "paper folding and crinkling"], "sample_ids": ["yYEVLuqEytU", "zPpG3RD8lSs"], "start_seconds": ["40", "20"], "properties": ["grunt, slurp, background", "paper, fold, crinkle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["several sheep bleat and a man speaks", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "water flows as men speak and yell"], "sample_ids": ["x4a9YGIw4ok", "vJ7JPEFhyLA"], "start_seconds": ["120", "16"], "properties": ["water, gurgles, stops", "water, flow, men"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet flushes and water splashes", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tDVADusiIoc", "zFjIWfSD-4"], "start_seconds": ["60", "410"], "properties": ["man, radio, blows", "People, motor, brakes"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man speaking on a radio as wind blows?", "label": 0}, {"captions": ["a car accelerates and wind blows", "people speak as gunfire rings out"], "sample_ids": ["u0TrcHhkPQ", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["accelerates, wind, blows", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a race car accelerates and revs its 
engine ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wnpJndXuxLc", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["beeps, loud, whistle", "engine, idle, woman"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "plastic is tapped on while someone speaks"], "sample_ids": ["wTjoRj1se3U", "wvKpEYswXO0"], "start_seconds": ["390", "150"], "properties": ["engine, run, people", "plastic, tap, speak"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a jet engine is running and people are talking", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a vehicle accelerates squealing tires"], "sample_ids": ["zkKdxzNC97Y", "sd7xVssqlw"], "start_seconds": ["27", "50"], "properties": ["hard, surface, door", "accelerates, tires, squealing"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "a man speaks as a car is passing by"], 
"sample_ids": ["uRlbY6aoBU", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["a, distance, sneeze", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "water is sprayed across a hard surface"], "sample_ids": ["w0xsN8X18Y", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["music, surface, rain", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "spraying followed by silence"], "question": "which entity is sprayed across a hard surface?", "label": 1}, {"captions": ["a helicopter engine runs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["t5ZbXbniOWk", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["engine, helicopter, run", "a woman, a television program, a bird"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a bird?", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "pigeons vocalize 
and birds chirp"], "sample_ids": ["x4a9YGIw4ok", "uiS58TNyUiw"], "start_seconds": ["120", "430"], "properties": ["water, gurgles, stops", "vocalize, bird, chirp"], "captions_pred_video": ["footage is blurry and out of focus", "of the pigeon in the cage"], "captions_pred_audio": ["a toilet flushes and water splashes", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["food is frying while a woman speaks", "wind blowing followed by a zoom"], "sample_ids": ["yhQ2Lg-7qDY", "vr8ZXjEBhMQ"], "start_seconds": ["130", "150"], "properties": ["food, woman, speak", "wind, blow, zoom"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a faucet is running and a man is speaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "some men converse over an engine running"], "sample_ids": ["xyL9F5VrjkE", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["engine, run, wind", "men, converse, engine"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["continuous snoring", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["sLkeqCDJIyw", "wqZ135Ssz0"], "start_seconds": ["120", "60"], "properties": ["loud, snoring, noise", "man, woman, squawks"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["paper folding and crinkling", "several insects fly while two men talk"], "sample_ids": ["zPpG3RD8lSs", "s-T9OVOiMLo"], "start_seconds": ["20", "330"], "properties": ["paper, fold, crinkle", "several, fly, men"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a bird is chirping and tweeting a bird song"], "sample_ids": ["wIvYjuR3nrg", "wPz6QRAkEb4"], "start_seconds": ["9", "60"], "properties": ["birds, pigeons, vocalize", "chirps, tweets, song"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "a bird in a cage on top of a pole"], "captions_pred_audio": ["birds are chirping and cooing", 
"birds are chirping in the background "], "question": "which bird is singing", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "a male speaks and another male speaks"], "sample_ids": ["tK4VlLsNxak", "viuTg1M-dqg"], "start_seconds": ["120", "30"], "properties": ["a, dial, telephone", "two males, speaking, male"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a muffled toilet flushes and the water drains"], "sample_ids": ["uZesmtKZGSw", "sfAvvZwdLCY"], "start_seconds": ["250", "20"], "properties": ["car, track, man", "flushes, drains, water"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a toilet is flushed"], "question": "which entity has water draining?", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["zhx6hoYrHeI", "y2bVZ7rz-5M"], "start_seconds": ["160", "280"], "properties": ["engine, sputter, rough", "motor 
noise, horn, siren"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "pigeons vocalize and birds chirp"], "sample_ids": ["zl9Dqx-j7q4", "uiS58TNyUiw"], "start_seconds": ["6", "430"], "properties": ["engine, laugh, loud", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a man driving a car in the dark", "of the pigeon in the cage"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["continuous sneezing together with speech", "a duck quacks continuously"], "sample_ids": ["x4dZyf9Gbj0", "vh30P49Po6s"], "start_seconds": ["130", "30"], "properties": ["continuous, sneeze, speech", "quacks, continuously, duck"], "captions_pred_video": ["footage is blurry and out of focus", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman sneezes and speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a car accelerates and wind blows"], "sample_ids": ["ugHJF0hfYkg", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["engine, idle, continuously", "accelerates, wind, blows"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "several beeps are followed by a hit and a woman talking"], "sample_ids": 
["w5W5Kqtc8E", "w34HjHr6gAY"], "start_seconds": ["100", "30"], "properties": ["wind, blow, vehicle", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "water flows and trickles"], "sample_ids": ["vbr9mHKc8WM", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["an engine is idling", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water pouring and bubbling", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uyRfq-jKPpo", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["water, bubbles, pouring", "three men, wind, flow"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water is running from a faucet", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a liquid flowing?", "label": 1}, {"captions": ["someone whistles a song", "a man speaks as a car is passing by"], "sample_ids": ["sIXTftIuUgw", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["someone, song, whistle", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["xERFUeZONz8", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["ring, approach, traffic", "loud, jet engine, roar"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an emergency vehicle siren blares", "an aircraft engine roars and a man speaks "], "question": 
"which entity is louder", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "people applaud and hoot and chat quietly"], "sample_ids": ["tQWGZLItBXk", "wwyfGO2J4"], "start_seconds": ["170", "90"], "properties": ["music, person, ding", "people, applaud, hoot"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "people are clapping and speaking with background noise "], "question": "which entity has more people", "label": 1}, {"captions": ["a small engine spits as it runs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sZvwOuuPGP0", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["spits, engine, runs", "airplane, boy, fly"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a medium engine is running ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["food is frying while a woman speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yhQ2Lg-7qDY", "xBxDz0CFVn0"], "start_seconds": ["130", "30"], "properties": ["food, woman, speak", "stream, water, flow"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage is blurry and out of focus"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["dogs barking and whimpering", "a baby cries and a woman speaks"], "sample_ids": ["tIY7qOV3rEM", "tMbMDvT50j8"], "start_seconds": ["0", "12"], "properties": ["barking, whimpering, dog", "a, cry, woman"], 
"captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["distant humming of an engine", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yVPZ2MNWpms", "wz7N8YRy74I"], "start_seconds": ["0", "30"], "properties": ["sound, distance, engine", "rooster, crow, background, men"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster crow?", "label": 1}, {"captions": ["people speak and tapping occurs", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tFCUUGdREgA", "xfaoyyzw2WU"], "start_seconds": ["70", "180"], "properties": ["people, tap, speak", "loud, jet engine, roar"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a child speaks in closed space"], "sample_ids": ["wPz6QRAkEb4", "yW6FWLSLkx4"], "start_seconds": ["60", "40"], "properties": ["chirps, tweets, song", "child, space, speak"], "captions_pred_video": ["a bird in a cage on top of a pole", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds are chirping in the background ", "a woman is speaking with background noise and breathing sounds "], 
"question": "which entity is speaking", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sNB8zxXneIM", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["several, quack, cocks", "music, gunfire, explosion"], "captions_pred_video": ["a group of geese in a cage", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zO-LSSY92ZM", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["liquid, surface, sound", "a woman, something, fried"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "- a woman cooking in the kitchen"], "captions_pred_audio": ["steam is hissing and hissing", "a woman is speaking while food is frying in the 
background"], "question": "which entity is about cooking", "label": 1}, {"captions": ["scraping and female speech with distant music", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["yHeVV-xeOxQ", "x6ijhqRY38s"], "start_seconds": ["130", "250"], "properties": ["female, speech, music", "something metal, glass, hit"], "captions_pred_video": ["of a girl milking a goat's udder", "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking and dishes are clanging "], "question": "which entity is about something hitting something?", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["x6ijhqRY38s", "wqZ135Ssz0"], "start_seconds": ["250", "60"], "properties": ["something metal, glass, hit", "two men, woman, birds"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zdYdyF9-m8U", "vYkA3cfXp5Q"], "start_seconds": ["7", "30"], "properties": ["wind, crash, shoreline", "engine, accelerate, idle"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["waves crash and wind blows ", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a jet engine spools up and takes off", "three men talk while wind blows and some liquid flows"], "sample_ids": 
["vBslzh7saPw", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["engine, spools, takes", "three men, wind, flow"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a moving object", "label": 0}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vuUVPzd2FXw", "zFjIWfSD-4"], "start_seconds": ["160", "410"], "properties": ["a, steam, release", "People, motor, brakes"], "captions_pred_video": ["of the person cooking on the grill with a spatula", null], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["an electric engine works nearby followed by a child talking", "a woman speaks as she rubs two objects together"], "sample_ids": ["xSKJGCItUWE", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["engine, work, child", "two objects, woman, speak"], "captions_pred_video": ["footage of the helicopter flying in the room", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["an airplane engine runs", "water flows and trickles"], "sample_ids": ["yVPZ2MNWpms", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "water, flow, trickle"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car is driving by on the road ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "birds chirp and objects are moved around"], "sample_ids": ["yswmmRZFItk", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["background, frog, croak", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a close up of a frog in the water", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a frog is croaking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks as several small engines run", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["u9A6VZQCZpU", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["a, man, talk", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man 
is speaking with wind noise in the background "], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["a wooden clack accompanies nearby chirping birds", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yeFvk9x0wWI", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["clack, bird, chirp", "a woman, something, fried"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a vehicle engine accelerating then running on idle"], "sample_ids": ["ukxt9I7eMMg", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["continuous, woman, speaking", "engine, accelerate, idle"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an airplane engine runs", "roadway noise occurs and a truck accelerates"], "sample_ids": ["yVPZ2MNWpms", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["engine, airplane, runs", "noise, truck, accelerate"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a car is driving by on the road ", "a car is driving on the road "], "question": "which is not a source of noise", "label": 0}, {"captions": ["a duck quacks loudly and continuously", "birds chirp and objects are moved around"], "sample_ids": ["vh30P49Po6s", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], 
"properties": ["loud, continuous, quacks", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a duck is quacking loudly", "insects buzz and a man speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a drill drills through something then people begin laughing"], "sample_ids": ["xvDdE3zNf8Y", "tEE3MpBt1sg"], "start_seconds": ["120", "50"], "properties": ["a, female, speaks", "drill, something, laugh"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a woman speaks and crumples paper", "people are laughing breathing and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a infant makes noise and is excited"], "sample_ids": ["u21-Z5gJCB8", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "noise, excited, infant"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a person sniffles and sneezes", "people speak as gunfire rings out"], "sample_ids": ["uRlbY6aoBU", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["sneezes, sniffles, person", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", 
"label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "someone is typing on a computer keyboard"], "sample_ids": ["sZPuqDgX2V0", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["engine, accelerate, intercom", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a person is typing on a keyboard"], "question": "which entity is stationary", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "wind blows as people chatter quietly"], "sample_ids": ["rwtmaKiCcQU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["nozzle, depressed, spray can", "wind, chatter, people"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage is blurry and out of focus"], "captions_pred_audio": ["spraying and people speaking", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone snores nearby", "a man speaks as a motor runs in the background"], "sample_ids": ["spJCm8tD9Zo", "xZepNM9qcRA"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "background, motor, run"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person is snoring loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["children speak as a female ask them questions", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["wEBlkGWVWwE", "wDVMhEdTiVw"], "start_seconds": ["260", "30"], "properties": ["female, speak, questions", "gun, shoot, water"], "captions_pred_video": ["shows a person writing on the whiteboard", "a 
blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a game", "label": 1}, {"captions": ["an airplane engine runs", "a woman speaks as she rubs two objects together"], "sample_ids": ["yVPZ2MNWpms", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["engine, airplane, runs", "two objects, woman, speak"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a car is driving by on the road ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is moving", "label": 0}, {"captions": ["a female speaks softly as paper crinkles", "people speak as gunfire rings out"], "sample_ids": ["xvDdE3zNf8Y", "wqTCwqVRDlk"], "start_seconds": ["120", "80"], "properties": ["a, female, speaks", "gunfire, ring, speak"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a 
purple tie", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a goat bleats and someone makes a calling noise", "a clock ticktocks"], "sample_ids": ["vlS6YMeWAPo", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["noise, bleat, call", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a goat bleats and birds chirp", "a clock is ticking loudly"], "question": "which entity makes a ticktocks noise", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vZAw4apG0Es", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["background, clock, ticktocks", "female, spraying, scream"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a train engine runs and a horn blows"], "sample_ids": ["wz7N8YRy74I", "zPX9o1uDiI"], "start_seconds": ["30", "40"], "properties": ["rooster, crow, background, people", "engine, horn, run"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is a train", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a 
woman speaks happily and an animal chirps"], "sample_ids": ["sAam2NqGhLY", "uWAAAL4CIoc"], "start_seconds": ["20", "0"], "properties": ["snoring, breathing, child", "a woman, chirps, animal"], "captions_pred_video": ["of a little girl sleeping on a couch", null], "captions_pred_audio": ["a person is snoring", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "paper is crumpling consistently"], "sample_ids": ["t97k0cejSQE", "v5cSxLaHADY"], "start_seconds": ["250", "0"], "properties": ["bird, chirp, insect", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "an electric engine works nearby followed by a child talking"], "sample_ids": ["vimzuGQvdcU", "xSKJGCItUWE"], "start_seconds": ["30", "10"], "properties": ["a, man, yells", "engine, work, child"], "captions_pred_video": ["a group of people are rafting down a river", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a high pitched engine is running and a child speaks"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["goats bleat and metal clings", "a car speeding up in the distance"], "sample_ids": ["tH17JPjDPnc", "u0TrcHhkPQ"], "start_seconds": ["260", "20"], "properties": ["bleat, metal, clings", "distance, car, speed"], "captions_pred_video": ["feed of the goats eating hay in the barn", null], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a race car accelerates and revs its engine "], "question": "which entity is moving 
faster", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "an engine idles consistently before sputtering some"], "sample_ids": ["yRx9txMcBl0", "rwTERCUno"], "start_seconds": ["40", "90"], "properties": ["accelerates, tires, squeals", "engine, idle, sputter"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "an engine is idling and vibrating"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["birds chirp as a train approaches", "a clock ticktocks"], "sample_ids": ["xM4joTqDVp4", "v-g-j2uTByM"], "start_seconds": ["160", "30"], "properties": ["bird, chirp, train", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a cat meows and children speak", "water flows and trickles"], "sample_ids": ["x5cuQjOdM3E", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["cat, speak, children", "water, flow, trickle"], "captions_pred_video": ["a black background with an airplane flying in the sky", "the rocks on the beach are surrounded by water and the sky is visible in the background"], 
"captions_pred_audio": ["a cat meows and a woman speaks", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a helicopter engine runs", "small dogs yip and bark sharply"], "sample_ids": ["t5ZbXbniOWk", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["engine, helicopter, run", "bark, yip, sharply"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a helicopter is flying overhead ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "paper is crumpling consistently"], "sample_ids": ["su6FAOcOA8c", "v5cSxLaHADY"], "start_seconds": ["4", "0"], "properties": ["engine, idle, woman", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "winds blows roughly as a vehicle races past"], "sample_ids": ["vbpKkWvfOu4", "xjvTpk2Zpr8"], "start_seconds": ["560", "70"], "properties": ["a, woman, man", "wind, blows, vehicle"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a jet engine roars and wind blows "], "question": "which 
entity is a person", "label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["y2ZBGpgbhHM", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["dog, chirp, breathe", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["birds chirping and a dog panting", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a machine engine runs and a man speaks"], "sample_ids": ["u21-Z5gJCB8", "vs65y4qmyBE"], "start_seconds": ["30", "340"], "properties": ["background, voice, man", "engine, run, man"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a heavy engine is running and men are speaking "], "question": "which entity has a man speaking with another voice speaking in the background?", "label": 0}, {"captions": ["a woman speaks and other women and a man talk with her", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vbpKkWvfOu4", "tdWhHV3X25Q"], "start_seconds": ["560", "60"], "properties": ["a, woman, man", "applause, audience, yells"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 1}, 
{"captions": ["a female speaks softly as paper crinkles", "paper is crumpling consistently"], "sample_ids": ["xvDdE3zNf8Y", "v5cSxLaHADY"], "start_seconds": ["120", "0"], "properties": ["a, female, speaks", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman speaks and crumples paper", "paper is crumpled and crinkled"], "question": "which entity is crumpling consistently", "label": 1}, {"captions": ["a baby laugh at a sputter", "a stream of water runs briefly"], "sample_ids": ["sLUnaPT5gM8", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["laugh, sputter, baby", "stream, water, run"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["material crumbles into a microphone", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["vofpvUo6NAw", "ziUT9IFTkjg"], "start_seconds": ["220", "10"], "properties": ["material, crumbles, microphone", "background, birds, rustling"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", null], "captions_pred_audio": ["paper is being crumpled and crinkled", "birds are chirping and a chime is ringing "], "question": "which entity is more likely to be found in a forest", "label": 1}, {"captions": ["some men converse over an engine running", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["sCiy7QS1U", "tIY7qOV3rEM"], "start_seconds": ["300", "0"], "properties": ["men, converse, engine", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": [null, "a 
dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a dog is barking and a cat is meowing"], "question": "which animal is barking", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "a steam engine runs and whistles as it passes by"], "sample_ids": ["u5RmF3c3Aw", "se87d6yxEOA"], "start_seconds": ["60", "10"], "properties": ["engine, car, zoom", "run, whistle, pass"], "captions_pred_video": [null, "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a train is moving and blowing its whistle "], "question": "which entity is a steam engine?", "label": 1}, {"captions": ["a cat meows and children speak", "a man speaks as a car is passing by"], "sample_ids": ["x5cuQjOdM3E", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["cat, speak, children", "a, car, pass"], "captions_pred_video": ["a black background with an airplane flying in the sky", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vf44CgrjT0A", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["loud, long, person", "men, talk, cars"], 
"captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a loud burp", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "water flows and trickles"], "sample_ids": ["vVhthZ45k3Y", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["cat, purr, hiss", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and out of focus", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "an infant crying as a woman laughs"], "sample_ids": ["vW4x7S1VfQc", "xhmRY9yhC7c"], "start_seconds": ["150", "20"], "properties": ["clacking, oil, woman", "a, laugh, infant"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["food sizzles in a frying pan", "a baby cries and a woman speaks"], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["leaves rustle while man speaks", "a steam engine runs and whistles as it passes by"], 
"sample_ids": ["zOZleIRqZm4", "se87d6yxEOA"], "start_seconds": ["80", "10"], "properties": ["leaves, rustle, speak", "run, whistle, pass"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a train is moving and blowing its whistle "], "question": "which entity is moving", "label": 1}, {"captions": ["a car accelerates and wind blows", "an engine runs loudly"], "sample_ids": ["u0TrcHhkPQ", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["accelerates, wind, blows", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "paper is crumpling consistently"], "sample_ids": ["w0xsN8X18Y", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["music, surface, rain", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a horn honks and then loudly blares", "a loud engine muffles a man as he speaks"], "sample_ids": ["wnpJndXuxLc", "xyx6eNVEYRY"], "start_seconds": ["50", "380"], "properties": ["horn, honk, loud", "loud, engine, muffles"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the 
background ", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["ylpYOorfH4o", "zj2R0XoFr5k"], "start_seconds": ["410", "50"], "properties": ["engine, running, wind", "airplane, boy, fly"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "water splashes as an animal walks through"], "sample_ids": ["vbZ-0lGPneg", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["a woman, a television program, a bird", "animal, water, splashes"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["humming and rattling of an engine idling 
as it revs", "a vehicle accelerates and squeals tires"], "sample_ids": ["xMXvkIcaG0Y", "yRx9txMcBl0"], "start_seconds": ["30", "40"], "properties": ["sound, humming, rattling", "accelerates, tires, squeals"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["an engine is revving and accelerating ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xfaoyyzw2WU", "tDVADusiIoc"], "start_seconds": ["180", "60"], "properties": ["loud, jet engine, roar", "water, radio, man"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which is louder", "label": 1}, {"captions": ["a helicopter engine runs continuously", "pigeons vocalize and birds chirp"], "sample_ids": ["ugHJF0hfYkg", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["engine, running, continuously", "vocalize, bird, chirp"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of the pigeon in the cage"], 
"captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["water flows as a woman laughs and a man speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vddP56-ogds", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["water, flow, laugh", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "an infant crying as a woman laughs"], "sample_ids": ["vf44CgrjT0A", "xhmRY9yhC7c"], "start_seconds": ["20", "20"], "properties": ["loud, long, person", "a, laugh, infant"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a loud burp", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a car speeding up in the distance"], "sample_ids": ["s6DESzUTGjY", "u0TrcHhkPQ"], "start_seconds": ["16", "20"], "properties": ["wind, laugh, woman", "distance, car, speed"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube 
how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", null], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yZp6xizR0yU", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["animal, bleat, cry", "a woman, something, fried"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman talking?", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yLy-WycbVVE", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["background, people, talk", "engine, laugh, loud"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a man sprays as a scraping occurs in the background"], "sample_ids": ["uYT5gxnyMWM", "sOa7g-44Dag"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "background, man, spray"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation 
in an attic"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and rubbing his hands together "], "question": "which entity has a female spraying?", "label": 0}, {"captions": ["white noise and birds chirping", "a man speaks followed by another man speaking outside"], "sample_ids": ["wRBHTgrbiwg", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["noise, white, chirping", "two men, speak, follow"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "people cheer as a vehicle engine revs"], "sample_ids": ["uEU-Hg5MTN8", "xjhAnI2q6hM"], "start_seconds": ["27", "6"], "properties": ["a woman, laughs, animal", "engine revs, vehicle, people"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["a horse runs while two women talk", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sdvI1mHAsc", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["two women, horse, run", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a person snores loudly multiple times at a close distance"], "sample_ids": 
["wDVMhEdTiVw", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["gun, shoot, water", "loud, multiple, distance"], "captions_pred_video": ["a blurry image of trees and water in the forest", null], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "someone snores nearby"], "sample_ids": ["zofjfKhqLk8", "spJCm8tD9Zo"], "start_seconds": ["10", "90"], "properties": ["background, metal, clings", "someone snores, nearby, someone"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wztCSUxOf8", "sLUnaPT5gM8"], "start_seconds": ["130", "0"], "properties": ["a crowd, yells, applauds", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs barking and whimpering", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tIY7qOV3rEM", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["barking, whimpering, dog", "female, spraying, scream"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a baby is crying"], "question": 
"which entity is a person", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a toilet flushes and a female speaks"], "sample_ids": ["xyx6eNVEYRY", "yaln9y8I7ms"], "start_seconds": ["380", "230"], "properties": ["loud, engine, muffles", "female, flushes, toilet"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage is blurry and out of focus"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sfAvvZwdLCY", "y8WEcpOlT3I"], "start_seconds": ["20", "40"], "properties": ["water drains, flushes, water", "harsh, wind, blows"], "captions_pred_video": ["footage of the toilet in the bathroom", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with wind noise in the background "], "question": "which entity is a source of water", "label": 0}, {"captions": ["dogs barking and whimpering", "an infant crying frantically"], "sample_ids": ["tIY7qOV3rEM", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "cry, infant, frantically"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of the baby crying in the car seat"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "birds chirp and an insect buzzes around"], "sample_ids": ["tZGN5a7ybxo", "t97k0cejSQE"], "start_seconds": ["60", "250"], "properties": ["ring, train, horn", "bird, chirp, insect"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "a bee on a purple thistle flower"], 
"captions_pred_audio": ["a train is moving and blowing its horn ", "a bee buzzes and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a vehicle accelerates and squeals tires"], "sample_ids": ["zPX9o1uDiI", "yRx9txMcBl0"], "start_seconds": ["40", "40"], "properties": ["engine, horn, run", "accelerates, tires, squeals"], "captions_pred_video": [null, "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "birds chirp and objects are moved around"], "sample_ids": ["u21-Z5gJCB8", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["background, voice, man", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["shmR4OZtzqA", "yajyRTUQk3U"], "start_seconds": ["30", "400"], 
"properties": ["man, engine, idle", "a woman, something, fried"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man speaks while a motor runs", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a telephone rings followed by a woman talking"], "sample_ids": ["wjsXBsc7M40", "tGcFnX0GHI"], "start_seconds": ["10", "0"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "ring, talk, woman"], "captions_pred_video": ["footage of the baby playing with a toothbrush", null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a man speaks as a car is passing by"], "sample_ids": ["zCrAfDfv6-A", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "a, car, pass"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person whistles a song", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a person clicking a mouse?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["w5W5Kqtc8E", "vbZ-0lGPneg"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more calm", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "someone whistles a tune"], "sample_ids": ["xjhAnI2q6hM", "sIXTftIuUgw"], "start_seconds": ["6", "90"], "properties": ["wind, blow, loudly", "someone, tune, whistle"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "paper is crumpling consistently"], "sample_ids": ["vqZuVbG6-HI", "v5cSxLaHADY"], "start_seconds": ["130", "0"], "properties": ["background, male, female", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry because it's raining 
outside", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a door slams shut roughly", "a woman and man are speaking"], "sample_ids": ["zkKdxzNC97Y", "vbpKkWvfOu4"], "start_seconds": ["27", "560"], "properties": ["a door, slams, shut", "two people, speaking, woman, man"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a door is opened and closed", "a woman is speaking and a man is speaking"], "question": "which entity is a person", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["y2bVZ7rz-5M", "ziUT9IFTkjg"], "start_seconds": ["280", "10"], "properties": ["motor noise, horn, siren", "background, birds, rustling"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "water is sprayed across a hard surface"], "sample_ids": ["sapQIQUhFc", "sQwlkXjQabo"], "start_seconds": ["280", "10"], "properties": ["liquid, flow, distance", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "spraying 
followed by silence"], "question": "which entity is a spray of water?", "label": 1}, {"captions": ["someone snores nearby", "paper folding and crinkling"], "sample_ids": ["spJCm8tD9Zo", "zPpG3RD8lSs"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "paper, fold, crinkle"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a person is snoring loudly", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["xyx6eNVEYRY", "vzxHnu-SFEw"], "start_seconds": ["380", "80"], "properties": ["loud, engine, muffles", "two objects, woman, speak"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is rubbing together?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["vddP56-ogds", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["water, flow, laugh", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xBxDz0CFVn0", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["wind, chatter, people", "harsh, wind, blows"], "captions_pred_video": 
["footage is blurry and out of focus", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 0}, {"captions": ["people speak in a closed space", "a woman speaks happily and an animal chirps"], "sample_ids": ["sTpirNYo8vQ", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["people, space, speak", "a woman, chirps, animal"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sZPuqDgX2V0", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["engine, accelerate, intercom", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vr8ZXjEBhMQ", "vbZ-0lGPneg"], "start_seconds": ["150", "30"], "properties": ["wind, blow, zoom", "a woman, a television program, a bird"], "captions_pred_video": ["is taken from a motorcycle's point of view", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a video of a natural phenomenon?", "label": 0}, {"captions": ["a woman talks while something is fried and 
objects are tapped", "a car accelerates and wind blows"], "sample_ids": ["yajyRTUQk3U", "u0TrcHhkPQ"], "start_seconds": ["400", "20"], "properties": ["a woman, something, fried", "accelerates, wind, blows"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a man speaks as a car is passing by"], "sample_ids": ["vW4x7S1VfQc", "sK4u5T8hW78"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "a, car, pass"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["xyx6eNVEYRY", "wwyfGO2J4"], "start_seconds": ["380", "90"], "properties": ["loud, engine, muffles", "people, applaud, hoot"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", null], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a person snoring", "a gun shoots, 
followed by water sloshing nearby"], "sample_ids": ["t8tv5YRMJUg", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["a person, snore, loud", "gun, shoot, water"], "captions_pred_video": ["of a man getting his face licked by another man", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks with water running", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["wTideSjRFS0", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["water, running, woman", "male, duck, laugh"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and ducks are quacking"], "question": "which entity has a duck in it?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "people speak as gunfire rings out"], "sample_ids": ["wTideSjRFS0", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["food, sizzle, woman", "gunfire, ring, speak"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a horse runs while two women talk", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sdvI1mHAsc", "wDVMhEdTiVw"], "start_seconds": ["20", "30"], "properties": ["two women, horse, run", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a gun is fired 
followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["uEU-Hg5MTN8", "vlS6YMeWAPo"], "start_seconds": ["27", "40"], "properties": ["a woman, laughs, animal", "sheep, baa, birds"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a goat bleats and birds chirp"], "question": "which entity has more animals", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "birds chirp and objects are moved around"], "sample_ids": ["u6jIvCtKarQ", "yPUYU6t3rwo"], "start_seconds": ["70", "370"], "properties": ["a, man, speaks", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "a telephone rings and a bird vocalizes"], "sample_ids": ["wRBHTgrbiwg", "skd2PphS6oI"], "start_seconds": ["50", "190"], "properties": ["birds, chirp, cooing", "ring, bird, vocalize"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a telephone bell rings repeatedly "], "question": "which entity is a bird vocalizing?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a man speaks while turning a water faucet 
on"], "sample_ids": ["vJrjSeP17yE", "vf9xf3vMsGM"], "start_seconds": ["40", "540"], "properties": ["a person is sleeping, snoring, person", "A man speaks while turning a water faucet on."], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the person washing their hands under the faucet"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while water is running in the background"], "question": "which entity is a man?", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a toilet flushes and a female speaks"], "sample_ids": ["yYEVLuqEytU", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": ["grunt, slurp, background", "female, flushes, toilet"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage is blurry and out of focus"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["some men converse over an engine running", "some tunes played by whistling"], "sample_ids": ["sCiy7QS1U", "u6BnG6YZqJ4"], "start_seconds": ["300", "0"], "properties": ["men, converse, engine", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "paper is crumpling consistently"], "sample_ids": ["sYITalLZjj4", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["water, rushes, background, birds", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of the person holding a pair of scissors"], 
"captions_pred_audio": ["wind blows and birds chirp", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a woman speaks as she rubs two objects together"], "sample_ids": ["un9VQlzgZM", "vzxHnu-SFEw"], "start_seconds": ["5", "80"], "properties": ["females, talk, laugh", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a woman speaking?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a young woman speaks over spraying and another person yells"], "sample_ids": ["rqfQRErjfk8", "uYT5gxnyMWM"], "start_seconds": ["170", "50"], "properties": ["crowd, cheers, applauds", "person, spray, yell"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a crowd of people clapping 
and cheering", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a train horn blares as a train passes, then fades"], "sample_ids": ["xKB8O8LTs6s", "zVacuqSb4LI"], "start_seconds": ["70", "30"], "properties": ["music, gunfire, explosion", "blares, fades, train"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is more calm", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "water pouring and bubbling"], "sample_ids": ["vdoxuJn9lTc", "uyRfq-jKPpo"], "start_seconds": ["40", "50"], "properties": ["burp, loud, girl", "water, bubbles, pouring"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a child speaks followed by a burp", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a toilet flushes and a female speaks"], "sample_ids": ["zPX9o1uDiI", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": ["engine, horn, run", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a toilet flushes and a man speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a car accelerates and wind blows"], "sample_ids": ["zsLxS-uLJTw", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["horn, blast, train", "accelerates, wind, blows"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", null], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["male speech with light ticking", "small dogs growl, bark and yip."], "sample_ids": ["xO-Q2BlIIPU", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["male, speech, ticking", "growl, bark, yip"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "the puppies 
are playing with a toy"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a dog is barking and growling"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["t97k0cejSQE", "vfYTJq7nU"], "start_seconds": ["250", "130"], "properties": ["bird, chirp, insect", "rustling, ducks, quack"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "several insects fly while two men talk"], "sample_ids": ["vzxHnu-SFEw", "s-T9OVOiMLo"], "start_seconds": ["80", "330"], "properties": ["two objects, woman, speak", "several, fly, men"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a man climbing up a tree using a rope to reach the top of the tree"], 
"captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a woman speaking as she rubs two objects together?", "label": 0}, {"captions": ["someone is snoring while sleeping", "paper folding and crinkling"], "sample_ids": ["ujMt0-D-x2k", "zPpG3RD8lSs"], "start_seconds": ["0", "20"], "properties": ["snore, sleep, someone", "paper, fold, crinkle"], "captions_pred_video": ["of the dog playing with a toy on the floor", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a person is snoring loudly", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a woman speaks as she rubs two objects together"], "sample_ids": ["vuUVPzd2FXw", "vzxHnu-SFEw"], "start_seconds": ["160", "80"], "properties": ["a, steam, release", "two objects, woman, speak"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is being rubbed together", "label": 0}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "several insects fly while two men talk"], "sample_ids": ["wsHBIgzs9Fs", "s-T9OVOiMLo"], "start_seconds": ["50", "330"], "properties": ["horn, continuous, buzzing", "several, fly, men"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a zoo", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a flush is followed by gurgling water, then another flush"], "sample_ids": ["v7jJS8aAyA", "tqR406bGiE"], "start_seconds": ["10", "40"], "properties": ["wind, blows, loudly", "flush, water, gurgle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorcycle 
engine is idling and vibrating", "a toilet is flushed"], "question": "which entity is silent", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "several insects fly while two men talk"], "sample_ids": ["xvDdE3zNf8Y", "s-T9OVOiMLo"], "start_seconds": ["120", "330"], "properties": ["a, female, speaks", "several, fly, men"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["speaking following by laughing and clapping", "someone snores nearby"], "sample_ids": ["u2f5NpsoHBg", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["person, laugh, clap", "someone snores, nearby, someone"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a person is snoring loudly"], "question": "which person is speaking", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "water is sprayed across a hard surface"], "sample_ids": ["vf44CgrjT0A", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["loud, long, person", "water, spray, surface"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a loud burp", "spraying followed by silence"], "question": "which entity is a spray?", "label": 1}, {"captions": ["a consistent ticking pattern", "a large crowd cheers and applauds"], "sample_ids": ["sCeWURVHfOM", "rqfQRErjfk8"], "start_seconds": ["30", "170"], "properties": ["ticking, pattern, clock", "crowd, cheers, applauds"], 
"captions_pred_video": ["- a close-up view of the clock's inner workings", "a man hugging another man in front of an orchestra"], "captions_pred_audio": ["ticking of a clock", "a crowd of people clapping and cheering"], "question": "which entity is more likely to be a clock", "label": 0}, {"captions": ["food is frying while a woman speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["yhQ2Lg-7qDY", "sQGXqGcwOTc"], "start_seconds": ["130", "3"], "properties": ["food, woman, speak", "cling, speak, dishes"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a faucet is running and a man is speaking", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a person sniffles and then sneezes in the distance", "people cheer as a vehicle engine revs"], "sample_ids": ["uRlbY6aoBU", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["a, distance, sneeze", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is sneezing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["people speak and tapping occurs", "water flows as a woman laughs and a man speaks"], "sample_ids": ["tFCUUGdREgA", "vddP56-ogds"], "start_seconds": ["70", "30"], "properties": ["people, tap, speak", "water, flow, laugh"], "captions_pred_video": ["a person riding a white horse in an indoor arena", null], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "water is running and gurgling and a man is speaking"], "question": "which entity is a video of a person speaking and tapping?", "label": 0}, {"captions": ["a woman speaks as she rubs two objects 
together", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vzxHnu-SFEw", "ukg5L09Wpvo"], "start_seconds": ["80", "150"], "properties": ["two objects, woman, speak", "clickety-clack, train, whistle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a male speaks and another male speaks", "a man speaks as a car is passing by"], "sample_ids": ["viuTg1M-dqg", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "a, car, pass"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a man speaks as crickets sing", "three men talk while wind blows and some liquid flows"], "sample_ids": ["ryFDPxgDOGc", "vJ7JPEFhyLA"], "start_seconds": ["570", "16"], "properties": ["a, crickets, sing", "three men, wind, flow"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yLCORCnd35Q", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["engine, aircraft, runs", "People, motor, brakes"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", null], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["a man is filing a hard object", "several insects fly while two men talk"], "sample_ids": ["vveS8HT7Uog", "s-T9OVOiMLo"], "start_seconds": ["100", "330"], "properties": ["a man, hard, object", "several, fly, men"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is filing 
and speaking with background noise and breathing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a man filing a hard object?", "label": 0}, {"captions": ["a woman speaks in a fast tone with a male", "someone whistles a tune"], "sample_ids": ["sTpirNYo8vQ", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["a, tone, fast", "someone, tune, whistle"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a clock ticktocks"], "sample_ids": ["zj2R0XoFr5k", "v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "roadway noise occurs and a truck accelerates"], "sample_ids": ["zkKdxzNC97Y", "tgbONvsP47Y"], "start_seconds": ["27", "0"], "properties": ["loud, bang, noise", "noise, truck, accelerate"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a door is opened and closed", "a car is driving on the road "], "question": "which noise is made by a truck", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["smDKStoHBJo", "vbZ-0lGPneg"], 
"start_seconds": ["0", "30"], "properties": ["a, talk, baby, cry", "a woman, a television program, a bird"], "captions_pred_video": ["a man holding a crying baby in his arms", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a dog is whimpering"], "question": "which entity has a baby?", "label": 0}, {"captions": ["an airplane engine roars increasingly louder", "a duck quacks continuously"], "sample_ids": ["vBslzh7saPw", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["engine, roar, louder", "quacks, continuously, duck"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a car speeding up in the distance"], "sample_ids": ["y1saVTXsKwc", "u0TrcHhkPQ"], "start_seconds": ["80", "20"], "properties": ["a, dog, talk", "distance, car, speed"], "captions_pred_video": ["a dog playing with a pink ball", null], "captions_pred_audio": ["a dog barks and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["vBHyYJ8pL0", "tezvROoo4bs"], "start_seconds": ["2", "40"], "properties": ["noise, door, opening", "audio, throttle, speaking"], "captions_pred_video": [null, "footage of a busy city street with cars parked on both sides of the road"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a car accelerates and revs while a man speaks "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, 
{"captions": ["a male is speaking and a duck quacks as others laugh", "a man speaks as a car is passing by"], "sample_ids": ["tiDFTC-5vU", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a duck in it?", "label": 0}, {"captions": ["multiple birds vocalize and wind blows", "a man speaks as a motor runs in the background"], "sample_ids": ["uoGVs9yUqY4", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["multiple, vocalize, wind", "background, motor, run"], "captions_pred_video": ["for how to make a wooden shed door youtube", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a man speaking to a motor?", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a woman speaks as she rubs two objects together"], "sample_ids": ["sOa7g-44Dag", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["audio, scratching, man", "two objects, woman, speak"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a child speaks in closed space"], "sample_ids": ["ugHJF0hfYkg", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["engine, running, continuously", "child, space, speak"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not running continuously", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sOa7g-44Dag", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["background, man, spray", "two men, woman, birds"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], 
"captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a duck quacks continuously"], "sample_ids": ["wz7N8YRy74I", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, people", "quacks, continuously, duck"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a train horn blows as it passes by"], "sample_ids": ["slZLHwNbbt4", "zVacuqSb4LI"], "start_seconds": ["300", "30"], "properties": ["train, horn, sound", "horn, blows, train"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a train 
whistle blows and a train passes by with a whistle blowing "], "question": "which train is blowing its horn?", "label": 1}, {"captions": ["a motorcycle engine is idling", "people speak as gunfire rings out"], "sample_ids": ["vZAqdHZ81yA", "wqTCwqVRDlk"], "start_seconds": ["180", "80"], "properties": ["engine, motorcycle, idling", "gunfire, ring, speak"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking and a gun is fired"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a clock ticktocks"], "sample_ids": ["sofxkNWaP0s", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "pigeons vocalize and birds chirp"], "sample_ids": ["yeFvk9x0wWI", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["chirp, twitter, clatter", "vocalize, bird, chirp"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of the pigeon in the cage"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["leaves rustle while man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["zOZleIRqZm4", "xZepNM9qcRA"], "start_seconds": ["80", "30"], "properties": ["leaves, rustle, 
speak", "background, motor, run"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["wqUmIEzuNz4", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["frog, bird, vocalize", "two men, woman, birds"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", null], "captions_pred_audio": ["a cat meows and rustles", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a drill runs and two people laugh"], "sample_ids": ["sa6TLVbooCc", "tEE3MpBt1sg"], "start_seconds": ["240", "50"], "properties": ["people, laugh, child", "two people, laugh, drill"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "people are laughing breathing and speaking with background noise "], "question": "which entity shows a drill running?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sAam2NqGhLY", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["snoring, breathing, child", "a woman, something, fried"], "captions_pred_video": ["of a little girl sleeping on a couch", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is snoring", "a woman is speaking while food is frying in the 
background"], "question": "which entity is a person", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a horn honks and then loudly blares"], "sample_ids": ["zOZleIRqZm4", "wnpJndXuxLc"], "start_seconds": ["80", "50"], "properties": ["rustling, leaves, person", "horn, honk, loud"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a stream of water runs briefly", "paper folding and crinkling"], "sample_ids": ["x-PeY8Yb8M4", "zPpG3RD8lSs"], "start_seconds": ["300", "20"], "properties": ["stream, water, run", "paper, fold, crinkle"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a car is driving on a wet road ", "the wind blows and a mouse clicks "], "question": "which entity is not a stream of water?", "label": 1}, {"captions": ["an airplane engine roars 
increasingly louder", "an airplane engine spools and people speak"], "sample_ids": ["vBslzh7saPw", "wTjoRj1se3U"], "start_seconds": ["90", "390"], "properties": ["engine, roar, louder", "airplane, engine, spool"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a jet engine is running and people are talking"], "question": "which entity is a video of an airplane engine?", "label": 0}, {"captions": ["a child speaks in closed space", "a telephone rings followed by a woman talking"], "sample_ids": ["yW6FWLSLkx4", "tGcFnX0GHI"], "start_seconds": ["40", "0"], "properties": ["child, space, speak", "ring, talk, woman"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xMXvkIcaG0Y", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["sound, humming, rattling", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an engine is revving and accelerating ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "water flows and trickles"], "sample_ids": ["yFB25fqfU8I", "tB7hWb9gTuQ"], "start_seconds": ["300", "30"], "properties": ["wave, crash, shoreline", "water, flow, trickle"], 
"captions_pred_video": ["footage of a person surfing in the ocean", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["someone is snoring while sleeping", "some tunes played by whistling"], "sample_ids": ["ujMt0-D-x2k", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["snore, sleep, someone", "tune, play, whistling"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person is snoring loudly", "a person whistling a song"], "question": "which entity is not playing a tune?", "label": 0}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a clock ticktocks"], "sample_ids": ["su6FAOcOA8c", "v-g-j2uTByM"], "start_seconds": ["4", "30"], "properties": ["engine, run, woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vddP56-ogds", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["water, flow, laugh", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows water flowing?", "label": 0}, {"captions": ["birds twitter and 
chirp and clatter", "birds chirp as a bell rings"], "sample_ids": ["yeFvk9x0wWI", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["chirp, twitter, clatter", "chirp, bell, ring"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a infant makes noise and is excited"], "sample_ids": ["wsHBIgzs9Fs", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["horn, continuous, buzzing", "noise, excited, infant"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a clock ticktocks"], "sample_ids": ["sapQIQUhFc", "v-g-j2uTByM"], "start_seconds": ["280", "30"], "properties": ["liquid, flow, distance", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a person is burping while a girl speaks"], "sample_ids": ["zliInBdC98Y", "vdoxuJn9lTc"], "start_seconds": ["30", "40"], "properties": ["a, baby, cries, wails", "person, burp, girl"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "a group of young girls playing a video game together in a living room"], 
"captions_pred_audio": ["a baby cries and a woman speaks", "a child speaks followed by a burp"], "question": "which entity is a person?", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sTpirNYo8vQ", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["a, tone, fast", "two men, woman, birds"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["bees buzz and wind blows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["tMJne1a4AFI", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["bees buzz, wind blows, bees", "engine, revs, vehicle"], "captions_pred_video": ["a swarm of bees on the ground", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a swarm of bees buzzing around", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["people speak in a closed space", "an airplane engine spools and people speak"], "sample_ids": ["sTpirNYo8vQ", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["people, space, speak", "airplane, engine, spool"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a jet engine is running and people are talking"], "question": "which entity is a video", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "water pouring and bubbling"], "sample_ids": ["yks4cLgIDMc", "uyRfq-jKPpo"], "start_seconds": ["170", "50"], "properties": 
["background, speaking, child", "water, bubbles, pouring"], "captions_pred_video": ["footage of two kids wrestling on the floor", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and a child is crying", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a infant makes noise and is excited"], "sample_ids": ["wIJK3-5y0kA", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["a, cry, baby", "noise, excited, infant"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is a baby?", "label": 0}, {"captions": ["a toilet flushes and water drains", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sfAvvZwdLCY", "xfaoyyzw2WU"], "start_seconds": ["20", "180"], "properties": ["water drains, flushes, water", "loud, jet engine, roar"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a toilet is flushed", "an aircraft engine roars and a man speaks 
"], "question": "which entity is louder", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a infant makes noise and is excited"], "sample_ids": ["xjhAnI2q6hM", "wIJK3-5y0kA"], "start_seconds": ["6", "30"], "properties": ["engine revs, vehicle, people", "noise, excited, infant"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a baby cries and a woman speaks"], "question": "which entity is more excited", "label": 1}, {"captions": ["electronic beeps occur in a short series", "water splashes as an animal walks through"], "sample_ids": ["y682ml90jGw", "w1ir-sZ3Im8"], "start_seconds": ["11", "90"], "properties": ["beeps, series, electronic", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a beeping sound is being made ", "water splashes and gurgles as people speak"], "question": "which entity is more natural", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "water splashes as an animal walks through"], "sample_ids": ["wIvYjuR3nrg", "w1ir-sZ3Im8"], "start_seconds": ["9", "90"], "properties": ["birds, pigeons, vocalize", "animal, water, splashes"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds are chirping and cooing", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["uYT5gxnyMWM", "ziUT9IFTkjg"], "start_seconds": ["50", "10"], "properties": ["a, scream, girl", "background, birds, 
rustling"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yW6FWLSLkx4", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["a, child, speaks", "People, motor, brakes"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vW4x7S1VfQc", "zj2R0XoFr5k"], "start_seconds": ["150", "50"], "properties": ["clacking, oil, woman", "airplane, boy, fly"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["food sizzles in a frying pan", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["material crumbles into a microphone", "birds chirp and objects are moved around"], "sample_ids": ["vofpvUo6NAw", "yPUYU6t3rwo"], "start_seconds": ["220", "370"], "properties": ["material, crumbles, microphone", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["paper is being crumpled and crinkled", "insects buzz and a man 
speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a person is snoring while sleeping"], "sample_ids": ["tPJvjq9QePY", "vJrjSeP17yE"], "start_seconds": ["40", "40"], "properties": ["animal, bleat, moo", "a person is sleeping, snoring, person"], "captions_pred_video": ["a dog and a sheep in a barn", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a baby cries and a man speaks", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["birds chirp quietly and an adult man speaks", "a duck quacks continuously"], "sample_ids": ["zuua6-5goWw", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "quacks, continuously, duck"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["a frog croaks as other frogs croak in the background", "a toilet flushes and a female speaks"], "sample_ids": ["yswmmRZFItk", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["background, frog, croak", "female, flushes, toilet"], "captions_pred_video": ["a close up of a frog in the 
water", "footage is blurry and out of focus"], "captions_pred_audio": ["a frog is croaking", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "paper is crumpling consistently"], "sample_ids": ["w8uLijTqtlU", "v5cSxLaHADY"], "start_seconds": ["70", "0"], "properties": ["wind, microphone, noise", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and shaky", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["the wind is blowing strongly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "water rushes by"], "sample_ids": ["zdYdyF9-m8U", "x-PeY8Yb8M4"], "start_seconds": ["7", "300"], "properties": ["wind, crash, shoreline", "water, rushes, by"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["waves crash and wind blows ", "a car is driving on a wet road "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a vehicle accelerates squealing tires"], "sample_ids": ["sOa7g-44Dag", "sd7xVssqlw"], "start_seconds": ["30", "50"], "properties": ["audio, scratching, man", "accelerates, tires, squealing"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "water flows as men speak and yell"], "sample_ids": ["wvKpEYswXO0", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["water, tap, run", 
"water, flow, men"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has water flowing?", "label": 1}, {"captions": ["a horse runs while two women talk", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["sdvI1mHAsc", "w34HjHr6gAY"], "start_seconds": ["20", "30"], "properties": ["two women, horse, run", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a clock ticktocks in wind", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yVumC9TGknc", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, wind", "loud, multiple, distance"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", null], "captions_pred_audio": ["a series of beeps and chirps", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "someone snores nearby"], "sample_ids": ["y8WEcpOlT3I", "spJCm8tD9Zo"], "start_seconds": ["40", "90"], 
"properties": ["wind, speak, buffeting", "someone snores, nearby, someone"], "captions_pred_video": ["on how to use a sewing machine youtube", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a person is snoring loudly"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["un9VQlzgZM", "xyL9F5VrjkE"], "start_seconds": ["5", "20"], "properties": ["wind, speak, laugh", "wind, motor, distance"], "captions_pred_video": [null, "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "the wind is blowing and a car is passing by "], "question": "which entity is about a motor running in the distance as a soft wind periodically gusts?", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "wind blows in gusts as a woman speaks in the distance"], "sample_ids": ["s4Uz1Ffgo04", "uC9dtII1KDI"], "start_seconds": ["100", "150"], "properties": ["water, rushes, vehicle", "wind, gusts, distance"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a person riding a horse in a riding arena"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman is speaking with wind noise and breathing in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a man speaks as a car is passing by"], "sample_ids": ["zFjIWfSD-4", "sK4u5T8hW78"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "an infant crying as a woman laughs"], "sample_ids": ["uWAAAL4CIoc", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["a woman, chirps, animal", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a baby cries and a woman speaks"], "question": "which entity is more likely to be a solitary event", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a car speeding up in the distance"], "sample_ids": ["tGcFnX0GHI", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["ring, talk, woman", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a woman and man speak while food is frying"], "sample_ids": ["xMXvkIcaG0Y", "zk-xJGQU8-4"], "start_seconds": ["30", "130"], "properties": ["sound, humming, rattling", "food, man, woman"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["an engine is revving and accelerating ", "a woman is speaking 
while dishes are clanging and music is playing in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a woman speaks happily and an animal chirps"], "sample_ids": ["smDKStoHBJo", "uWAAAL4CIoc"], "start_seconds": ["0", "0"], "properties": ["a, infant, speaking", "a woman, chirps, animal"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a dog is barking "], "question": "which entity is a woman speaking to an animal?", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "wind blows as people chatter quietly"], "sample_ids": ["sOa7g-44Dag", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["background, man, spray", "wind, chatter, people"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a clock alarm sounds and gears turn"], "sample_ids": ["xjvTpk2Zpr8", "w2M4i1mklOA"], "start_seconds": ["70", "30"], "properties": ["engine, run, wind", "alarm, gears, turn"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of an antique clock"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a clock is ticking and a bell is ringing "], "question": "which entity is not a clock?", "label": 0}, {"captions": ["a horse runs while two women talk", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sdvI1mHAsc", "y8WEcpOlT3I"], "start_seconds": ["20", "40"], "properties": ["two women, horse, run", 
"harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 0}, {"captions": ["a person is whistling", "an infant crying frantically"], "sample_ids": ["sIXTftIuUgw", "zwOBqeFTgiU"], "start_seconds": ["90", "30"], "properties": ["person, whistling, person", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a person whistling a song", "a baby cries loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["a man speaks followed by another man speaking outside", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["viuTg1M-dqg", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["two men, speak, follow", "motor noise, horn, siren"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn and a siren", "label": 1}, {"captions": ["a goat screams and people speak in the background", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["xC8kbrKJmco", "y2bVZ7rz-5M"], "start_seconds": ["0", "280"], "properties": ["background, goat, scream", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a goat is bleating ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "winds blows roughly as a vehicle races past"], 
"sample_ids": ["su6FAOcOA8c", "xjvTpk2Zpr8"], "start_seconds": ["4", "70"], "properties": ["engine, run, woman", "wind, blows, vehicle"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "multiple ducks quack continuously"], "sample_ids": ["w5W5Kqtc8E", "wfHeoPDLMaM"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "multiple, quack, continuously"], "captions_pred_video": [null, "ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "ducks are quacking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a church bell rings several times", "a man talks while a clock does ticktock"], "sample_ids": ["sUVVjE3Ucp8", "spYNpeN7rPY"], "start_seconds": ["0", "1"], "properties": ["ring, bell, several", "a clock, ticktock, man"], "captions_pred_video": ["the 
video shows a stone wall with a clock on top of it and a bench in front of it", "in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["a church bell is ringing ", "a man is speaking and breathing with background noise "], "question": "which entity is a clock?", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "water is sprayed across a hard surface"], "sample_ids": ["w0xsN8X18Y", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["rain, thunder, surface", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "water splashes as an animal walks through"], "sample_ids": ["zF8yoL0rkbI", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": 
["engine, run, someone", "animal, water, splashes"], "captions_pred_video": ["footage of the traffic on the street at night", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "someone snores nearby"], "sample_ids": ["x6ijhqRY38s", "spJCm8tD9Zo"], "start_seconds": ["250", "90"], "properties": ["bowl, silverware, man", "someone snores, nearby, someone"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a male speaks over some small clicks", "winds blows roughly as a vehicle races past"], "sample_ids": ["uXxVebHsGZ8", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["male, clicks, speak", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["weDbePuc-Xc", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["music, slaps, human", "animal, grunts, snorts"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman is 
speaking and a baby is crying"], "question": "which entity has a woman speaking and an animal grunts and snorts?", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "a stream of water flows as people talk and wind blows"], "sample_ids": ["skd2PphS6oI", "xBxDz0CFVn0"], "start_seconds": ["190", "30"], "properties": ["ring, bird, vocalize", "stream, water, flow"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage is blurry and out of focus"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wy1eKjR7KC0", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "a woman, a television program, a bird"], "captions_pred_video": ["two police officers riding motorcycles down the street", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a woman is speaking and a dog is whimpering"], "question": "which entity is a solitary event", "label": 1}, {"captions": ["water splashes and a door squeaks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sdXV-ylviw", "xfaoyyzw2WU"], "start_seconds": ["190", "180"], "properties": ["sound, splash, door", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a dog barks and taps with background noise ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["an adult woman and an adult man speak", "an engine runs loudly"], "sample_ids": ["zTLVJCo4WEE", "vqZuVbG6-HI"], "start_seconds": 
["30", "130"], "properties": ["two people, adult, speak", "loud, engine, run"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["uqFtmnhuqA8", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["a, b, c", "motor noise, horn, siren"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "an airplane engine roars increasingly louder"], "sample_ids": ["sd7xVssqlw", "vBslzh7saPw"], "start_seconds": ["50", "90"], "properties": ["accelerates, tires, squealing", "engine, roar, louder"], "captions_pred_video": [null, "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine roars and accelerates "], "question": "which entity is louder", "label": 1}, {"captions": ["a small engine spits as it runs", "a train horn blows as it passes by"], "sample_ids": ["sZvwOuuPGP0", "zVacuqSb4LI"], "start_seconds": ["50", "30"], "properties": ["spits, engine, runs", "horn, blows, train"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong 
a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a medium engine is running ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which train is blowing a horn?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["y2bVZ7rz-5M", "yajyRTUQk3U"], "start_seconds": ["280", "400"], "properties": ["motor noise, horn, siren", "a woman, something, fried"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["u7C-AEBQM", "wSVhSdj0F0"], "start_seconds": ["30", "10"], "properties": ["ticks, rhythmic, quiet", "horn honks, keys jingle, electronic beep"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a car horn honks and keys jangle with background noise "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "some 
men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y8dSeubCNI", "uZesmtKZGSw"], "start_seconds": ["4", "250"], "properties": ["engine revving, people speaking, motorcycle", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a person uses a saw to cut some wood"], "sample_ids": ["v0wPrLBI3hg", "sHbXC6na9hg"], "start_seconds": ["30", "0"], "properties": ["vocalize, bird, speak", "a person, saw, wood"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "an engine is idling and vibrating"], "question": "which entity is a person", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["x5cuQjOdM3E", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["cat, meows, young woman", "a, scream, girl"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is 
speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "wind blows as people chatter quietly"], "sample_ids": ["uiItxDsDMFI", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["wood, piece, saw", "wind, chatter, people"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "footage is blurry and out of focus"], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a car speeding up in the distance"], "sample_ids": ["x5cuQjOdM3E", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["cat, meows, young woman", "distance, car, speed"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "a drill drills through something then people begin laughing"], "sample_ids": ["tjmoSi330GM", "tEE3MpBt1sg"], "start_seconds": ["23", "50"], "properties": ["speed, water, boat", "drill, something, laugh"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "people are laughing breathing and speaking with background noise "], "question": "which is a drill", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "multiple people speak and children yell while water gurgles"], "sample_ids": ["siJFXfGWgDk", "vb1fPSDI4c"], "start_seconds": ["50", "30"], "properties": ["a, bird, vehicle", "multiple, people, yell"], 
"captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a man is filing a hard object"], "sample_ids": ["v0x1odnXtP0", "vveS8HT7Uog"], "start_seconds": ["210", "100"], "properties": ["keyboard, type, computer", "a man, hard, object"], "captions_pred_video": ["how to make money on youtube in spanish", "footage is of a workbench with various tools on it including a hammer and a screwdriver"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is filing and speaking with background noise and breathing "], "question": "which object is harder to file", "label": 0}, {"captions": ["an animal growls followed by birds chirping", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["y2ZBGpgbhHM", "y2ZBGpgbhHM"], "start_seconds": ["30", "30"], "properties": ["animal, growl, bird", "birds, tweet, pant"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "birds chirping and a dog panting"], "question": "which entity has more animals", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "water flows as a woman laughs and a man speaks"], "sample_ids": ["zkKdxzNC97Y", "vddP56-ogds"], "start_seconds": ["27", "30"], "properties": ["hard, surface, door", "water, flow, laugh"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "water is running and gurgling and a man is speaking"], "question": "which entity is a video", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "various birds chirp and squeal, and an animal grunts"], 
"sample_ids": ["wRBHTgrbiwg", "tDlysoZiA1I"], "start_seconds": ["50", "0"], "properties": ["birds, chirp, cooing", "animal, grunts, chirps"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "birds are chirping and a rooster is crowing "], "question": "which entity is a recording of birds chirping?", "label": 0}, {"captions": ["the rumbling of a bus followed by a soft male voice", "winds blows roughly as a vehicle races past"], "sample_ids": ["vK93VuO0yNc", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["male voice, bus, rumble", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "an infant crying as a woman laughs"], "sample_ids": ["wqZ135Ssz0", "xhmRY9yhC7c"], "start_seconds": ["60", "20"], "properties": ["man, woman, squawks", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a baby cries and a woman speaks"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["tapping occurs then a baby cries", "a motor noise is accompanied by a door opening and closing"], "sample_ids": ["wIJK3-5y0kA", "vBHyYJ8pL0"], "start_seconds": ["30", "2"], "properties": ["a, cry, baby", "noise, door, opening"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "mechanisms are ticking and a sliding 
door is opening and closing "], "question": "which entity is accompanied by a door opening and closing?", "label": 1}, {"captions": ["an adult woman and an adult man speak", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zTLVJCo4WEE", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["two people, adult, speak", "People, motor, brakes"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more people", "label": 1}, {"captions": ["an engine runs and a man speaks", "a toilet flushes and a female speaks"], "sample_ids": ["yT5WfYMRr-U", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["engine, run, man", "female, flushes, toilet"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a man speaking?", "label": 0}, {"captions": ["multiple birds chirp and an animal grunts", "water pouring and bubbling"], "sample_ids": ["tDlysoZiA1I", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["animal, grunt, multiple", "water, bubbles, pouring"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do 
an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an airplane engine runs", "a stream of water runs briefly"], "sample_ids": ["yVPZ2MNWpms", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["engine, airplane, runs", "stream, water, run"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car is driving by on the road ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a vehicle engine accelerating then running on idle"], "sample_ids": ["u2f5NpsoHBg", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "engine, accelerate, idle"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "an engine is idling"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["uqFtmnhuqA8", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["a, b, c", "airplane, boy, fly"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["mechanisms are ticking and a 
hammer is striking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a toilet flushes and a female speaks"], "sample_ids": ["vXlk0lIQBFo", "yaln9y8I7ms"], "start_seconds": ["470", "230"], "properties": ["wind, talk, vocalize", "female, flushes, toilet"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage is blurry and out of focus"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["goats bleat and people speak", "loud, continuous burping"], "sample_ids": ["z5iUE5h0EPs", "y636gklDioE"], "start_seconds": ["30", "20"], "properties": ["goats bleat, people speak, language", "loud, continuous, burping"], "captions_pred_video": ["of the goat in the barn", "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a goat bleats and a man speaks", "a person burps loudly several times"], "question": "which is louder", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "an engine runs loudly"], "sample_ids": ["yaln9y8I7ms", "vqZuVbG6-HI"], "start_seconds": ["230", "130"], "properties": ["female, flushes, toilet", "loud, engine, run"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vSeGhaZt-aI", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["water, bubbles, run", "a, scream, girl"], "captions_pred_video": ["a man in a kitchen preparing a 
smoothie with a blender", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a grown man speaking and water bubbles and runs?", "label": 0}, {"captions": ["a child speaks in closed space", "a female speaks softly as paper crinkles"], "sample_ids": ["yW6FWLSLkx4", "xvDdE3zNf8Y"], "start_seconds": ["40", "120"], "properties": ["child, space, speak", "a, female, speaks"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman speaks and crumples paper"], "question": "which entity is speaking softly", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a frog croaks as other frogs croak in the background"], "sample_ids": ["xKB8O8LTs6s", "yswmmRZFItk"], "start_seconds": ["70", "0"], "properties": ["music, gunshots, explosion", "background, frog, croak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a close up of a frog in the water"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a frog is croaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["children speak as a female ask them questions", "water flows and trickles"], "sample_ids": ["wEBlkGWVWwE", "tB7hWb9gTuQ"], "start_seconds": ["260", "30"], "properties": ["female, speak, questions", "water, flow, trickle"], "captions_pred_video": ["shows a person writing on the whiteboard", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "water is splashing and 
gurgling"], "question": "which entity is not a flow of water?", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a drill drills through something then people begin laughing"], "sample_ids": ["zkKdxzNC97Y", "tEE3MpBt1sg"], "start_seconds": ["27", "50"], "properties": ["loud, bang, noise", "drill, something, laugh"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a door is opened and closed", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a child speaks", "a car speeding up in the distance"], "sample_ids": ["yW6FWLSLkx4", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["a, child, speaks", "distance, car, speed"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "vehicles pass by on a roadway"], "sample_ids": ["xKB8O8LTs6s", "tgbONvsP47Y"], "start_seconds": ["70", "0"], "properties": ["music, gunfire, explosion", "pass, vehicle, roadway"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a fire truck entering a garage"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a car is driving on the road "], "question": "which entity is more calm", "label": 1}, {"captions": ["women speak and laugh as wind blows", "dishes cling together then a man begins to speak"], "sample_ids": ["un9VQlzgZM", "sQGXqGcwOTc"], "start_seconds": ["5", "3"], "properties": ["wind, speak, laugh", "cling, 
speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "an infant crying frantically"], "sample_ids": ["zj2R0XoFr5k", "zwOBqeFTgiU"], "start_seconds": ["50", "30"], "properties": ["airplane, boy, fly", "cry, infant, frantically"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a man speaks with another voice speaking in the background"], "sample_ids": ["xKB8O8LTs6s", "u21-Z5gJCB8"], "start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "background, voice, man"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a woman speaking on a radio?", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yRx9txMcBl0", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["accelerates, tires, squeals", "People, motor, brakes"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, 
gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["females talk and laugh over gusting wind", "a man speaks as a car is passing by"], "sample_ids": ["un9VQlzgZM", "sK4u5T8hW78"], "start_seconds": ["5", "30"], "properties": ["females, talk, laugh", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "paper is crumpling consistently"], "sample_ids": ["v-wcQf4BDY0", "v5cSxLaHADY"], "start_seconds": ["120", "0"], "properties": ["bark, yip, sharply", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a dog barks and 
growls", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "dishes cling together then a man begins to speak"], "sample_ids": ["zY3icUyMdh8", "sQGXqGcwOTc"], "start_seconds": ["20", "3"], "properties": ["dog, bark, engine", "cling, speak, dishes"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "mechanisms are operating and water is splashing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "dogs barking and whimpering"], "sample_ids": ["w5W5Kqtc8E", "tIY7qOV3rEM"], "start_seconds": ["100", "0"], "properties": ["wind, engine, scream", "barking, whimpering, dog"], "captions_pred_video": [null, "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a dog is barking and a cat is meowing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a car accelerates and wind blows"], "sample_ids": ["yYEVLuqEytU", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["animal, pig, background", "accelerates, wind, blows"], "captions_pred_video": ["a baby goat is being petted by a person's hand", null], "captions_pred_audio": ["several sheep bleat and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["continuous sneezing together with speech", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["x4dZyf9Gbj0", "yajyRTUQk3U"], "start_seconds": ["130", "400"], "properties": ["continuous, sneeze, speech", "a 
woman, something, fried"], "captions_pred_video": ["footage is blurry and out of focus", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["sLUnaPT5gM8", "y2bVZ7rz-5M"], "start_seconds": ["0", "280"], "properties": ["loud, laughter, intermittent", "motor noise, horn, siren"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a person snores loudly multiple times at a close distance"], "sample_ids": ["ukxt9I7eMMg", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["continuous, woman, speaking", "loud, multiple, distance"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["water splashes as an animal walks through", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w1ir-sZ3Im8", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["animal, water, splashes", "airplane, boy, fly"], "captions_pred_video": ["footage of a group of people riding horses through a river", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["water splashes 
and gurgles as people speak", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks while water drains", "birds chirp and objects are moved around"], "sample_ids": ["vSeGhaZt-aI", "yPUYU6t3rwo"], "start_seconds": ["50", "370"], "properties": ["water, drain, man", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby cries and a woman moans", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["smDKStoHBJo", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["a, cry, woman", "a woman, something, fried"], "captions_pred_video": ["a man holding a crying baby in his arms", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman talking?", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "someone is burping continuously"], "sample_ids": ["w5W5Kqtc8E", "y636gklDioE"], "start_seconds": ["100", "20"], "properties": ["wind, engine, scream", "burps, burps, burps"], "captions_pred_video": [null, "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a person burps loudly several times"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["t25U-v4k4ts", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": 
["bees buzz, birds chirp, man speaks", "loud, laughter, intermittent"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks as a machine runs", "a man speaks as a machine runs"], "sample_ids": ["vD6lYD1l0BY", "vD6lYD1l0BY"], "start_seconds": ["330", "330"], "properties": ["a, machine, run", "a, machine, run"], "captions_pred_video": ["game controller being held in the hands of the person", "game controller being held in the hands of the person"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking and dishes are being washed "], "question": "which machine is running in the first image?", "label": 0}, {"captions": ["children cheer as a man speaks then an audience screams", "someone snores nearby"], "sample_ids": ["vJvryTwuAV8", "spJCm8tD9Zo"], "start_seconds": ["16", "90"], "properties": ["audience, cheer, man", "someone snores, nearby, someone"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["an adult woman and an adult man speak", "people cheer as a vehicle engine revs"], "sample_ids": ["zTLVJCo4WEE", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["two people, adult, speak", "engine revs, vehicle, people"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a truck 
is revving its engine and a man is speaking "], "question": "which entity shows people cheering?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yRx9txMcBl0", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["accelerates, tires, squeals", "People, motor, brakes"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["people speak then an engine runs", "wind blows as people chatter quietly"], "sample_ids": ["uMTTDZ2mb4", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["engine, run, people", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["tapping occurs then a baby cries", "wind blowing followed by a zoom"], "sample_ids": ["wIJK3-5y0kA", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["a, cry, baby", "wind, blow, zoom"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "is taken from a 
motorcycle's point of view"], "captions_pred_audio": ["a baby cries and a woman speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["females talk and laugh over gusting wind", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["un9VQlzgZM", "vbZ-0lGPneg"], "start_seconds": ["5", "30"], "properties": ["females, talk, laugh", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more likely to be a documentary", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wqADXCzngMw", "tdWhHV3X25Q"], "start_seconds": ["340", "60"], "properties": ["engine, idle, man", "applause, audience, yells"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["someone whistles a tune", "a child speaks in closed space"], "sample_ids": ["sIXTftIuUgw", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["someone, tune, whistle", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", 
"rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xyL9F5VrjkE", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["wind, motor, distance", "rustling, ducks, quack"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a duck quacks and a woman speaks"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a jet engine spools up and takes off"], "sample_ids": ["tEE3MpBt1sg", "vBslzh7saPw"], "start_seconds": ["50", "90"], "properties": ["drill, something, laugh", "engine, spools, takes"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a jet engine roars and accelerates "], "question": "which entity is a machine?", "label": 0}, {"captions": ["a cat meows as a young woman speaks", "paper is crumpling consistently"], "sample_ids": ["x5cuQjOdM3E", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["cat, meows, young woman", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a cat meows and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "a consistent ticking pattern"], "sample_ids": ["vz8868znkVQ", "sCeWURVHfOM"], "start_seconds": ["60", "30"], "properties": ["audio, click, kid speaking", "ticking, pattern, clock"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "- a 
close-up view of the clock's inner workings"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "ticking of a clock"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a person sniffs and sneezes", "a train horn blows as it passes by"], "sample_ids": ["uRlbY6aoBU", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["sneezes, person, sniffs", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is sneezing ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "someone is typing on a computer keyboard"], "sample_ids": ["w-4gHptFNuU", "v0x1odnXtP0"], "start_seconds": ["21", "210"], "properties": ["engine revs, accelerates, bump", "keyboard, type, computer"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "how to make money on youtube in spanish"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which entity is stationary", "label": 1}, {"captions": ["water pouring and bubbling", "an airplane 
engine spools and people speak"], "sample_ids": ["uyRfq-jKPpo", "wTjoRj1se3U"], "start_seconds": ["50", "390"], "properties": ["water, bubbles, pouring", "airplane, engine, spool"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["water is running from a faucet", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["continuous sneezing together with speech", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["x4dZyf9Gbj0", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["continuous, sneeze, speech", "a woman, laughs, animal"], "captions_pred_video": ["footage is blurry and out of focus", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a drill drills through something then people begin laughing"], "sample_ids": ["wSVhSdj0F0", "tEE3MpBt1sg"], "start_seconds": ["10", "50"], "properties": ["horn honks, keys jingle, electronic beep", 
"drill, something, laugh"], "captions_pred_video": [null, "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a vehicle is skidding and squealing tires"], "sample_ids": ["tZGN5a7ybxo", "soTOh3zYJfY"], "start_seconds": ["60", "40"], "properties": ["ring, train, horn", "vehicle, skid, tires"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["people speak and tapping occurs", "winds blows roughly as a vehicle races past"], "sample_ids": ["tFCUUGdREgA", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], "properties": ["people, tap, speak", "wind, blows, vehicle"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a beep occurs briefly", "music plays and a woman speaks on a radio before gunshots are fired"], "sample_ids": ["xtWeJ56-U-g", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["beep, occur, briefly", "music, radio, gunshots"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 
79 80 81 82 8", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has a woman speaking on a radio?", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a woman speaks and dog vocalizes"], "sample_ids": ["voJh2gJxXhA", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["music, frog, croak", "a, dog, vocalize"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", null], "captions_pred_audio": ["music is playing and crickets are chirping ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "someone whistles a tune"], "sample_ids": ["tw76HGONaKg", "sIXTftIuUgw"], "start_seconds": ["570", "90"], "properties": ["music, click, man", "someone, tune, whistle"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a person 
whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a horse runs while two women talk", "an infant crying as a woman laughs"], "sample_ids": ["sdvI1mHAsc", "xhmRY9yhC7c"], "start_seconds": ["20", "20"], "properties": ["two women, horse, run", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a child speaks"], "sample_ids": ["soTOh3zYJfY", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["vehicle, skid, tires", "a, child, speaks"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wyllXV6PjKo", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a baby, a woman, a man", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking and a baby is crying"], "question": "which entity has a baby?", "label": 0}, {"captions": ["children speak as a female ask them questions", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wEBlkGWVWwE", "y8WEcpOlT3I"], "start_seconds": ["260", "40"], "properties": ["female, speak, questions", "harsh, wind, blows"], "captions_pred_video": ["shows a person writing on the whiteboard", "on how to use 
a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water drains", "water flows as men speak and yell"], "sample_ids": ["sfAvvZwdLCY", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["water drains, flushes, water", "water, flow, men"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water", "label": 1}, {"captions": ["a baby laugh at a sputter", "pigeons vocalize and birds chirp"], "sample_ids": ["sLUnaPT5gM8", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["laugh, sputter, baby", "vocalize, bird, chirp"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of the pigeon in the cage"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "wind blows as people chatter quietly"], "sample_ids": ["y2bVZ7rz-5M", "xBxDz0CFVn0"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "wind, chatter, people"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage is blurry and out of focus"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a weapon fires multiple times"], "sample_ids": ["vSeGhaZt-aI", 
"sMC07Ucy7kg"], "start_seconds": ["50", "10"], "properties": ["water, bubbles, run", "weapon, fire, multiple"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is from a car's point of view"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "people are clapping and speaking with background noise "], "question": "which entity is more violent", "label": 1}, {"captions": ["leaves rustle while man speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["zOZleIRqZm4", "xjhAnI2q6hM"], "start_seconds": ["80", "6"], "properties": ["leaves, rustle, speak", "engine revs, vehicle, people"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle?", "label": 1}, {"captions": ["children speak and play together", "water splashes and a door squeaks"], "sample_ids": ["yVVP8XvWJTo", "sdXV-ylviw"], "start_seconds": ["260", "190"], "properties": ["children, speak, play", "sound, splash, door"], "captions_pred_video": ["footage of a playground at a school or daycare center", null], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a dog barks and taps with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "an airplane engine runs"], "sample_ids": ["vbZ-0lGPneg", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["a woman, a television program, a bird", "engine, airplane, runs"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": 
["a woman is speaking and a dog is whimpering", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a door slams shut roughly", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zkKdxzNC97Y", "vYkA3cfXp5Q"], "start_seconds": ["27", "30"], "properties": ["a door, slams, shut", "engine, accelerate, idle"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a door is opened and closed", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tPJvjq9QePY", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["animal, bleat, moo", "engine, laugh, loud"], "captions_pred_video": ["a dog and a sheep in a barn", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a baby cries and a man speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a man speaks while turning a water faucet on"], "sample_ids": ["vdoxuJn9lTc", "vf9xf3vMsGM"], "start_seconds": ["40", "540"], "properties": ["person, burp, girl", "A man speaks while turning a water faucet on."], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of the person washing their hands under the faucet"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking while water is running in the background"], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "water pouring and bubbling"], "sample_ids": ["tEE3MpBt1sg", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["drill, something, laugh", "water, bubbles, 
pouring"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["male speech with light ticking", "winds blows roughly as a vehicle races past"], "sample_ids": ["xO-Q2BlIIPU", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["male, speech, ticking", "wind, blows, vehicle"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "some tunes played by whistling"], "sample_ids": ["xERFUeZONz8", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["ring, approach, traffic", "tune, play, whistling"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "a young boy standing in front of a group of kids in a classroom"], 
"captions_pred_audio": ["an emergency vehicle siren blares", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xjhAnI2q6hM", "uZesmtKZGSw"], "start_seconds": ["6", "250"], "properties": ["wind, blow, loudly", "men, talk, cars"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "plastic is tapped on while someone speaks"], "sample_ids": ["ylpYOorfH4o", "wvKpEYswXO0"], "start_seconds": ["410", "150"], "properties": ["motor, run, steady", "plastic, tap, speak"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to 
replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "small dogs yip and bark sharply"], "sample_ids": ["v7jJS8aAyA", "v-wcQf4BDY0"], "start_seconds": ["10", "120"], "properties": ["wind, blows, loudly", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as horns blow", "a telephone rings followed by a woman talking"], "sample_ids": ["tHyNqRyK34A", "tGcFnX0GHI"], "start_seconds": ["24", "0"], "properties": ["a, man, speaks", "ring, talk, woman"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["xl2PIWyXaM", "ziUT9IFTkjg"], "start_seconds": ["160", "10"], "properties": ["chirp, man, younger person", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people are talking", "birds are chirping and a chime is ringing "], "question": "which 
entity has a man speaking?", "label": 0}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "someone is typing on a computer keyboard"], "sample_ids": ["wz7N8YRy74I", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["rooster, crow, background, men", "keyboard, type, computer"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a man speaks as a motor runs in the background"], "sample_ids": ["vh30P49Po6s", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "background, motor, run"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a duck is quacking loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a woman speaks with water running"], "sample_ids": ["sjlVMgdGSK0", "wTideSjRFS0"], "start_seconds": ["30", "30"], "properties": ["accelerates, vehicle, race car", "water, running, woman"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking while water is running in the background"], "question": "which entity is stationary", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a mechanical buzzing getting louder"], "sample_ids": ["wz7N8YRy74I", 
"sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["rooster, crow, background, people", "noise, loud, buzzing"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["a male speaks over some small clicks", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["uXxVebHsGZ8", "s7knHCFW82w"], "start_seconds": ["30", "30"], "properties": ["male, clicks, speak", "blow horn, get close, train"], "captions_pred_video": [null, "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "someone snores nearby"], "sample_ids": ["yks4cLgIDMc", "spJCm8tD9Zo"], "start_seconds": ["170", "90"], "properties": ["background, speaking, child", "someone snores, nearby, someone"], "captions_pred_video": ["footage of two kids wrestling on the floor", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking and a child is crying", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "wind blows as people chatter quietly"], "sample_ids": ["y8WEcpOlT3I", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": 
["harsh, wind, blows", "wind, chatter, people"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "people applaud and hoot and chat quietly"], "sample_ids": ["x6ijhqRY38s", "wwyfGO2J4"], "start_seconds": ["250", "90"], "properties": ["bowl, silverware, man", "people, applaud, hoot"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wTjoRj1se3U", "yajyRTUQk3U"], "start_seconds": ["390", "400"], "properties": ["engine, run, people", "a woman, something, fried"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a jet engine is running and people are talking", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "vehicles pass by on a roadway"], "sample_ids": ["uJV8NDaHqqk", "tgbONvsP47Y"], "start_seconds": ["100", "0"], "properties": ["loud, fly, chirp", "pass, vehicle, roadway"], "captions_pred_video": ["a bee hive in a wooden box", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a swarm of bees buzzing around", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a 
person whistles a meandering tune", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uFoga8sHpiw", "w5W5Kqtc8E"], "start_seconds": ["90", "100"], "properties": ["person, tune, whistle", "wind, blow, vehicle"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water splashes and a door squeaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["sdXV-ylviw", "uWAAAL4CIoc"], "start_seconds": ["190", "0"], "properties": ["sound, splash, door", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman is speaking and a dog is barking "], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "an airplane engine spools and people speak"], "sample_ids": ["su6FAOcOA8c", "wTjoRj1se3U"], "start_seconds": ["4", "390"], "properties": ["engine, idle, woman", "airplane, engine, spool"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a jet engine is running and people are talking"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a woman speaks as she rubs two objects together"], "sample_ids": ["uzQnlJXBbOM", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], "properties": ["ringing, beep, stop", "two objects, woman, speak"], "captions_pred_video": ["footage of a person using a cell phone on a table", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how 
to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a telephone rings and a man speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vdoxuJn9lTc", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["burp, loud, girl", "engine, laugh, loud"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a child speaks followed by a burp", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["water flows as men speak and yell", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vJ7JPEFhyLA", "xfaoyyzw2WU"], "start_seconds": ["16", "180"], "properties": ["water, flow, men", "loud, jet engine, roar"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking while the wind is blowing and 
water is splashing", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "water is sprayed across a hard surface"], "sample_ids": ["xl2PIWyXaM", "sQwlkXjQabo"], "start_seconds": ["160", "10"], "properties": ["chirp, man, younger person", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["birds are chirping and people are talking", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a car speeding up in the distance"], "sample_ids": ["sapQIQUhFc", "u0TrcHhkPQ"], "start_seconds": ["280", "20"], "properties": ["liquid, flow, distance", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a person sniffs and sneezes"], "sample_ids": ["vBHyYJ8pL0", "uRlbY6aoBU"], "start_seconds": ["2", "0"], "properties": ["noise, door, opening", "sneezes, person, sniffs"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is sneezing "], "question": "which entity is accompanied by a door opening and closing", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "water is sprayed across a hard surface"], "sample_ids": ["sQwlkXjQabo", "sQwlkXjQabo"], "start_seconds": ["10", "10"], "properties": ["liquid, surface, spray", "water, spray, surface"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a close-up of a red car with water droplets on the 
hood"], "captions_pred_audio": ["spraying followed by silence", "spraying followed by silence"], "question": "which entity is sprayed across a hard surface", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["wnpJndXuxLc", "uOpoD0gGXcs"], "start_seconds": ["50", "120"], "properties": ["blows, vehicle, train", "chirps, woman, bird"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a herd of cows grazing in the field"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "birds are chirping and a man is speaking"], "question": "which entity is a person", "label": 1}, {"captions": ["material crumbles into a microphone", "water splashes as an animal walks through"], "sample_ids": ["vofpvUo6NAw", "w1ir-sZ3Im8"], "start_seconds": ["220", "90"], "properties": ["material, crumbles, microphone", "animal, water, splashes"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["paper is being crumpled and crinkled", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video of a person speaking?", "label": 0}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tQWGZLItBXk", "tdWhHV3X25Q"], "start_seconds": ["170", "60"], "properties": ["music, person, ding", "applause, audience, yells"], "captions_pred_video": ["worms revolution screenshots", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 
1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tZGN5a7ybxo", "vYkA3cfXp5Q"], "start_seconds": ["60", "30"], "properties": ["ring, train, horn", "engine, accelerate, idle"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a train is moving and blowing its horn ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a train engine runs and a horn blows", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zPX9o1uDiI", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["engine, horn, run", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "two frogs croak at each other"], "sample_ids": ["un9VQlzgZM", "zg0X6BnhOLQ"], "start_seconds": ["5", "410"], "properties": ["females, talk, laugh", "two frogs, croak, at each other"], "captions_pred_video": [null, "footage of lightning in the sky at night"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a frog is croaking"], "question": "which entity is more likely to be a frog", "label": 1}, {"captions": ["scraping and female speech with distant music", "someone whistles a tune"], "sample_ids": ["yHeVV-xeOxQ", "sIXTftIuUgw"], "start_seconds": ["130", "90"], "properties": ["female, speech, music", "someone, tune, whistle"], "captions_pred_video": ["of a girl milking a goat's udder", null], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a person 
whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "wind blows as people chatter quietly"], "sample_ids": ["y8dSeubCNI", "xBxDz0CFVn0"], "start_seconds": ["4", "30"], "properties": ["men, women, car", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "people speak in a closed space"], "sample_ids": ["s6DESzUTGjY", "sTpirNYo8vQ"], "start_seconds": ["16", "30"], "properties": ["wind, laugh, woman", "people, space, speak"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a man is speaking while a car is revving and accelerating "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a vehicle engine revs as the 
vehicle passes"], "sample_ids": ["sK4u5T8hW78", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["a, car, pass", "engine, revs, vehicle"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a race car accelerates and revs its engine "], "question": "which vehicle is passing by", "label": 1}, {"captions": ["white noise and birds chirping", "roadway noise occurs and a truck accelerates"], "sample_ids": ["wRBHTgrbiwg", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["noise, white, chirping", "noise, truck, accelerate"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a car is driving on the road "], "question": "which noise is more likely to be heard", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xzKKf9bKNUo", "zl9Dqx-j7q4"], "start_seconds": ["10", "6"], "properties": ["background, noise, snoring", "engine, laugh, loud"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person snoring loudly", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple people speak and children yell while 
water gurgles", "small dogs yip and bark sharply"], "sample_ids": ["vb1fPSDI4c", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["multiple, people, yell", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["food is frying then a woman speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["ukxt9I7eMMg", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "two men, speak, follow"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "a frog croaks as other frogs croak in the background"], "sample_ids": ["uRlbY6aoBU", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["a, distance, sneeze", "background, frog, croak"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["a man is sneezing ", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "winds blows roughly as a vehicle races past"], "sample_ids": ["sfAvvZwdLCY", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["flushes, drains, water", "wind, blows, vehicle"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a toilet is flushed", "a jet engine roars and wind blows "], "question": "which 
entity is a source of water", "label": 0}, {"captions": ["a door opens and closes", "birds chirp and objects are moved around"], "sample_ids": ["vBHyYJ8pL0", "yPUYU6t3rwo"], "start_seconds": ["2", "370"], "properties": ["open, close, door", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["uEU-Hg5MTN8", "uYT5gxnyMWM"], "start_seconds": ["27", "50"], "properties": ["a woman, laughs, animal", "a, scream, girl"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "several insects fly while two men talk"], "sample_ids": ["y4tPJXBKDig", "s-T9OVOiMLo"], "start_seconds": ["20", "330"], "properties": ["a, noise, talk", "several, fly, men"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of a girl talking", "label": 0}, {"captions": ["an engine idles consistently before sputtering some", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["rwTERCUno", "tdWhHV3X25Q"], "start_seconds": ["90", "60"], "properties": ["engine, 
idle, sputter", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "an airplane engine runs"], "sample_ids": ["vbpKkWvfOu4", "yVPZ2MNWpms"], "start_seconds": ["560", "0"], "properties": ["a, woman, man", "engine, airplane, runs"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "paper is crumpling consistently"], "sample_ids": ["xl2PIWyXaM", "v5cSxLaHADY"], "start_seconds": ["160", "0"], "properties": ["chirp, man, younger person", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping and people are talking", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a machine beeps continuously"], "sample_ids": ["wyllXV6PjKo", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["a kid, talk, cry", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, 
{"captions": ["a man makes an exclamation, then another man speaks", "a duck quacks continuously"], "sample_ids": ["xO-Q2BlIIPU", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["two men, exclamation, speak", "quacks, continuously, duck"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an animal quacks rapidly", "an infant crying as a woman laughs"], "sample_ids": ["vh30P49Po6s", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["animal, quacks, rapidly", "a, laugh, infant"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a duck is quacking loudly", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["u21-Z5gJCB8", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["background, voice, man", "two men, woman, birds"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a clock ticktocks"], "sample_ids": ["sDSppXIlJrs", "v-g-j2uTByM"], "start_seconds": ["27", "30"], "properties": ["microphone, water, wind", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "in your 
own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["food is frying and sizzles", "dishes cling together then a man begins to speak"], "sample_ids": ["zNRChLjqcU", "sQGXqGcwOTc"], "start_seconds": ["220", "3"], "properties": ["food is frying, sizzles, food", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["water is running from a faucet into a sink", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a woman speaks as she rubs two objects together"], "sample_ids": ["t25U-v4k4ts", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["bees buzz, birds chirp, man speaks", "two objects, woman, speak"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and bees 
are buzzing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "an animal growls followed by birds chirping"], "sample_ids": ["w9lpbUn0hPc", "y2ZBGpgbhHM"], "start_seconds": ["30", "30"], "properties": ["male, wind, rustling", "animal, growl, bird"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", null], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "birds chirping and a dog panting"], "question": "which entity is followed by birds chirping", "label": 1}, {"captions": ["water splashes as an animal walks through", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["w1ir-sZ3Im8", "yDoT73BWsdA"], "start_seconds": ["90", "10"], "properties": ["animal, water, splashes", "engine, revs, vehicle"], "captions_pred_video": ["footage of a group of people riding horses through a river", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a race car accelerates and revs its engine "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["an audience gives applause as a man yells and a group sings", "a woman speaks and then a man speaks"], "sample_ids": ["tdWhHV3X25Q", "vbpKkWvfOu4"], "start_seconds": ["60", "560"], "properties": ["applause, audience, yells", "a, man, speaks"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man is speaking and a crowd is 
clapping", "a woman is speaking and a man is speaking"], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "birds vocalize and chirp continuously"], "sample_ids": ["uOpoD0gGXcs", "w1mlz3Pe4fU"], "start_seconds": ["120", "300"], "properties": ["chirps, woman, bird", "vocalize, chirp, continuously"], "captions_pred_video": ["a herd of cows grazing in the field", "of a bird in a cage"], "captions_pred_audio": ["birds are chirping and a man is speaking", "birds are chirping and singing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while water drains", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vSeGhaZt-aI", "w34HjHr6gAY"], "start_seconds": ["50", "30"], "properties": ["water, drain, man", "beeps, hit, woman"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["w2M4i1mklOA", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["loud, chime, bell", "motor noise, horn, siren"], "captions_pred_video": ["footage of an antique clock", "footage of a 
parade of fire trucks driving down the street"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vXlk0lIQBFo", "zj2R0XoFr5k"], "start_seconds": ["470", "50"], "properties": ["wind, talk, vocalize", "airplane, boy, fly"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a person snoring several times", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["spJCm8tD9Zo", "tDVADusiIoc"], "start_seconds": ["90", "60"], "properties": ["snore, person, several", "water, radio, man"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person?", "label": 0}, {"captions": ["birds chirp quietly and an adult man speaks", "a car speeding up in the distance"], "sample_ids": ["zuua6-5goWw", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["birds, chirp, quiet, man, speaks", "distance, car, speed"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["engines sputter roughly and tires squeal", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zhx6hoYrHeI", "uYT5gxnyMWM"], "start_seconds": ["160", "50"], "properties": ["engine, sputter, rough", "female, spraying, scream"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "an engine idles consistently before sputtering some"], "sample_ids": ["tDlfY3nmx1A", "rwTERCUno"], "start_seconds": ["160", "90"], "properties": ["applause, laugh, man", "engine, idle, sputter"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", null], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "an engine is idling and vibrating"], "question": "which entity is a machine?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "a car accelerates and wind blows"], "sample_ids": ["vBslzh7saPw", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["engine, roar, louder", "accelerates, wind, blows"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], 
"captions_pred_audio": ["a jet engine roars and accelerates ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["water runs into a sink while men speak", "a airplane flies overhead as a woman speaks"], "sample_ids": ["vzceMbklWc", "zj2R0XoFr5k"], "start_seconds": ["180", "50"], "properties": ["water, sink, run", "airplane, fly, woman"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["water is running and a man is speaking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a woman speaks as frying food sizzles"], "sample_ids": ["wqN6IIHw3po", "wTideSjRFS0"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "food, sizzle, woman"], "captions_pred_video": ["in your own words what is happening in this screenshot? 
blood splattered all over the place", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a man is speaking and water is splashing", "a woman is speaking while water is running in the background"], "question": "which entity is a man speaking to?", "label": 0}, {"captions": ["a vehicle accelerates squealing tires", "a stream of water runs briefly"], "sample_ids": ["sd7xVssqlw", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["accelerates, tires, squealing", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "people speak as gunfire rings out"], "sample_ids": ["sAam2NqGhLY", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["snoring, breathing, child", "gunfire, ring, speak"], "captions_pred_video": ["of a little girl sleeping on a couch", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person is snoring", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["su6FAOcOA8c", "vbZ-0lGPneg"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "a woman, a television program, a bird"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["metal clacking as food and oil sizzles followed 
by a woman talking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vW4x7S1VfQc", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["clacking, oil, woman", "three men, wind, flow"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["a man speaks as a machine runs", "an engine sputters followed by a car zooming by"], "sample_ids": ["vD6lYD1l0BY", "u5RmF3c3Aw"], "start_seconds": ["330", "60"], "properties": ["a, machine, run", "engine, car, zoom"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a race car accelerates and skids with wind noise in the background "], "question": "which entity is a car?", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xfaoyyzw2WU", "vbZ-0lGPneg"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "a woman, a television program, a bird"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a woman is speaking and a dog is whimpering"], "question": "which is quieter", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a woman speaks happily and an animal chirps"], "sample_ids": ["slZLHwNbbt4", "uWAAAL4CIoc"], "start_seconds": ["300", "0"], "properties": ["clap, distance, horn", "a woman, chirps, animal"], "captions_pred_video": 
["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "an adult man speaks over glass clinking"], "sample_ids": ["yYEVLuqEytU", "u6jIvCtKarQ"], "start_seconds": ["40", "70"], "properties": ["grunt, slurp, background", "a, man, speaks"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a person using a blender on a stove top"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man is speaking and dishes are being moved with background noise "], "question": "which entity is a person", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a woman speaks as she rubs two objects together"], "sample_ids": ["xKB8O8LTs6s", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["music, radio, gunshots", "two objects, woman, speak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman is speaking", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "rustling with distant murmuring"], "sample_ids": ["vh30P49Po6s", "wnNNcxAPwGQ"], "start_seconds": ["30", "0"], "properties": ["loud, continuous, quacks", "sound, distance, rustling"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a yellow truck doing a burnout on a race track"], "captions_pred_audio": ["a duck is quacking loudly", "a crowd of people are talking and laughing while a skateboard rolls by "], "question": "which entity is quieter", "label": 1}, {"captions": ["a weapon fires multiple times", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sMC07Ucy7kg", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["weapon, fire, multiple", "animal, grunts, snorts"], "captions_pred_video": ["footage is from a car's point of view", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is not a weapon", "label": 1}, {"captions": ["an animal quacks rapidly", "someone snores nearby"], "sample_ids": ["vh30P49Po6s", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["animal, quacks, rapidly", "someone snores, nearby, someone"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a duck is quacking loudly", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a music is played followed by a frog 
croaking and then music is played again", "a stream of water runs briefly"], "sample_ids": ["voJh2gJxXhA", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["music, frog, croak", "stream, water, run"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["uWPRNLnpy7Y", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["accelerate, laugh, vehicle", "engine, revs, vehicle"], "captions_pred_video": ["is taken from a car driving down the street", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving faster", "label": 0}, {"captions": ["scraping and female speech with distant music", "a telephone rings followed by a woman talking"], "sample_ids": ["yHeVV-xeOxQ", "tGcFnX0GHI"], "start_seconds": ["130", "0"], "properties": ["female, speech, music", "ring, talk, woman"], "captions_pred_video": ["of a girl milking a goat's udder", null], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a dark barks and whimpers", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sYj4hpDUZDQ", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["barks, whimpers, dark", "two men, woman, birds"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": 
["a dog barks and a cat meows", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak softly as food sizzles", "frogs croak and vocalize"], "sample_ids": ["yhQ2Lg-7qDY", "yswmmRZFItk"], "start_seconds": ["130", "0"], "properties": ["food, sizzle, speak", "croak, vocalize, frog"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a close up of a frog in the water"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a frog is croaking"], "question": "which entity is a frog", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "continuous sizzling with a woman speaking towards the end"], "sample_ids": ["vfYTJq7nU", "ukxt9I7eMMg"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "continuous, woman, speaking"], "captions_pred_video": [null, "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking while food is frying in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vVhthZ45k3Y", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["cat, purr, hiss", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and out of focus", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "paper folding and crinkling"], "sample_ids": ["tIY7qOV3rEM", "zPpG3RD8lSs"], 
"start_seconds": ["0", "20"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "paper, fold, crinkle"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "the wind blows and a mouse clicks "], "question": "which entity is not a living thing", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "wind blows as people chatter quietly"], "sample_ids": ["xfudFO976zE", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["animal, bleats, cry", "wind, chatter, people"], "captions_pred_video": ["footage is blurry and shaky", "footage is blurry and out of focus"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a person speaks briefly", "a jet engine spools up and takes off"], "sample_ids": ["zOZleIRqZm4", "vBslzh7saPw"], "start_seconds": ["80", "90"], "properties": ["person, talk, brief", "engine, spools, takes"], "captions_pred_video": ["a person picking 
berries from the bushes in the garden", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a jet engine roars and accelerates "], "question": "which is a moving object", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["yDoT73BWsdA", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["engine, revs, vehicle", "loud, jet engine, roar"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["water bubbles and gurgles.", "birds chirp and objects are moved around"], "sample_ids": ["tB7hWb9gTuQ", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["bubbles, gurgles, water", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["water is splashing and gurgling", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a woman speaks happily and an animal chirps"], "sample_ids": ["yFB25fqfU8I", "uWAAAL4CIoc"], "start_seconds": ["300", "0"], "properties": ["wave, crash, shoreline", "a woman, chirps, animal"], "captions_pred_video": ["footage of a person surfing in the ocean", null], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["a person is 
burping while a girl speaks", "paper folding and crinkling"], "sample_ids": ["vdoxuJn9lTc", "zPpG3RD8lSs"], "start_seconds": ["40", "20"], "properties": ["person, burp, girl", "paper, fold, crinkle"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a child speaks followed by a burp", "the wind blows and a mouse clicks "], "question": "which is not a person", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "some tunes played by whistling"], "sample_ids": ["xfudFO976zE", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["animal, bleats, cry", "tune, play, whistling"], "captions_pred_video": ["footage is blurry and shaky", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sfAvvZwdLCY", "w5W5Kqtc8E"], "start_seconds": ["20", 
"100"], "properties": ["water drains, flushes, water", "wind, blow, vehicle"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an engine runs and a man speaks", "an emergency vehicle engine runs then a horn blows and siren sounds"], "sample_ids": ["yT5WfYMRr-U", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["engine, run, man", "engine, horn, siren"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn and siren", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "water splashes as an animal walks through"], "sample_ids": ["ylpYOorfH4o", "w1ir-sZ3Im8"], "start_seconds": ["410", "90"], "properties": ["engine, running, wind", "animal, water, splashes"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and an engine is revving", "water splashes and gurgles as people 
speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "some tunes played by whistling"], "sample_ids": ["uYT5gxnyMWM", "u6BnG6YZqJ4"], "start_seconds": ["50", "0"], "properties": ["person, spray, yell", "tune, play, whistling"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "people speak as gunfire rings out"], "sample_ids": ["tDVADusiIoc", "wqTCwqVRDlk"], "start_seconds": ["60", "80"], "properties": ["man, radio, blows", "gunfire, ring, speak"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war zone", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a car accelerates and wind blows"], "sample_ids": ["vs65y4qmyBE", "u0TrcHhkPQ"], "start_seconds": ["340", "20"], "properties": ["engine, run, man", "accelerates, wind, blows"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a helicopter engine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["t5ZbXbniOWk", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["engine, helicopter, run", "female, spraying, scream"], "captions_pred_video": ["the image is 
loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["birds vocalize and a man speaks", "an infant crying as a woman laughs"], "sample_ids": ["v0wPrLBI3hg", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["vocalize, bird, speak", "a, laugh, infant"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a person snoring", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["t8tv5YRMJUg", "wRBHTgrbiwg"], "start_seconds": ["0", "50"], "properties": ["a person, snore, loud", "bird, owl, speak"], "captions_pred_video": ["of a man getting his face licked by another man", "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a person sniffs and breathes heavily", "birds are chirping and insects are buzzing"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "dishes cling together then a man begins to speak"], "sample_ids": ["ugHJF0hfYkg", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["engine, running, continuously", "cling, speak, dishes"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a helicopter is flying overhead ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a car speeding up in the distance"], 
"sample_ids": ["rwtmaKiCcQU", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["nozzle, depressed, spray can", "distance, car, speed"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", null], "captions_pred_audio": ["spraying and people speaking", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a man talks as several small engines run", "someone snores nearby"], "sample_ids": ["u9A6VZQCZpU", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["a, man, talk", "someone snores, nearby, someone"], "captions_pred_video": [null, "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uiItxDsDMFI", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["wood, piece, saw", "wind, blow, vehicle"], "captions_pred_video": ["a man cutting a log with an axe in the woods", null], "captions_pred_audio": ["a saw is being used with background noise ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not being sawed", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a diesel truck engine runs continuously"], "sample_ids": ["uYT5gxnyMWM", "sZvwOuuPGP0"], "start_seconds": ["50", "50"], "properties": ["a, scream, girl", "engine, diesel, truck"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a medium engine is running "], "question": "which entity is a vehicle", "label": 1}, {"captions": 
["loud clanking and banging with brief male speech", "water pouring and bubbling"], "sample_ids": ["sWZzXuWYY", "uyRfq-jKPpo"], "start_seconds": ["420", "50"], "properties": ["male, speech, banging", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a horn rings out as a machine runs by"], "sample_ids": ["xyx6eNVEYRY", "slZLHwNbbt4"], "start_seconds": ["380", "300"], "properties": ["loud, engine, muffles", "a, horn, run"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["x6ijhqRY38s", "w34HjHr6gAY"], "start_seconds": ["250", "30"], "properties": ["bowl, silverware, man", "beeps, hit, woman"], 
"captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a beep sounds followed by a child speaking"], "question": "which entity is about a woman talking?", "label": 1}, {"captions": ["ticking continues without interruption", "a man speaks uses a drill"], "sample_ids": ["v-g-j2uTByM", "x5eIC7S0fbg"], "start_seconds": ["30", "60"], "properties": ["ticking, continuous, clock", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a person in surgical gloves is using a needle to remove a small object from a tooth"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["birds tweet and squawk", "a person snores loudly multiple times at a close distance"], "sample_ids": ["w1mlz3Pe4fU", "sSMl2vc3ek"], "start_seconds": ["300", "20"], "properties": ["squawk, tweet, scream", "loud, multiple, distance"], "captions_pred_video": ["of a bird in a cage", null], "captions_pred_audio": ["birds are chirping and singing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["birds coo incessantly", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["yZrFNS7GFBQ", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], 
"properties": ["coo, bird, incessant", "sheep, baa, birds"], "captions_pred_video": ["of the bird in the cage", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["an owl hoots in the background ", "a goat bleats and birds chirp"], "question": "which entity is a sheep?", "label": 1}, {"captions": ["people speak softly as food sizzles", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yhQ2Lg-7qDY", "su6FAOcOA8c"], "start_seconds": ["130", "4"], "properties": ["food, sizzle, speak", "engine, idle, woman"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and a subway train is moving "], "question": "which entity is about a bus engine?", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": ["zNRChLjqcU", "wnpJndXuxLc"], "start_seconds": ["220", "50"], "properties": ["water, faucet, run", "beeps, loud, whistle"], "captions_pred_video": [null, "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["water is running from a faucet into a sink", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a stream of water runs briefly"], "sample_ids": ["u--KhUW8l1Y", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["engine, sound, horn", "stream, water, run"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a car is driving on a wet road "], "question": "which entity is a stream of 
water?", "label": 1}, {"captions": ["leaves rustle while man speaks", "a infant makes noise and is excited"], "sample_ids": ["zOZleIRqZm4", "wIJK3-5y0kA"], "start_seconds": ["80", "30"], "properties": ["leaves, rustle, speak", "noise, excited, infant"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby cries and a woman speaks"], "question": "which is quieter", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uEU-Hg5MTN8", "zl9Dqx-j7q4"], "start_seconds": ["27", "6"], "properties": ["a woman, laughs, animal", "engine, laugh, loud"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["several insects fly while two men talk", "water pouring and bubbling"], "sample_ids": ["s-T9OVOiMLo", "uyRfq-jKPpo"], "start_seconds": ["330", "50"], "properties": ["several, fly, men", "water, bubbles, pouring"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["sWZzXuWYY", "sQGXqGcwOTc"], "start_seconds": ["420", "3"], "properties": ["male, clanks, thumps", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about a machine?", "label": 0}, {"captions": ["a door opens and closes", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vBHyYJ8pL0", "zj2R0XoFr5k"], "start_seconds": ["2", "50"], "properties": ["open, close, door", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a propeller rotates loudly and intensely"], "sample_ids": ["x6ijhqRY38s", "ugHJF0hfYkg"], "start_seconds": ["250", "10"], "properties": ["something metal, glass, hit", "loud, intense, propeller"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a helicopter is flying overhead "], "question": "which is louder", 
"label": 1}, {"captions": ["an aircraft engine runs as people speak", "vehicles pass by on a roadway"], "sample_ids": ["wTjoRj1se3U", "tgbONvsP47Y"], "start_seconds": ["390", "0"], "properties": ["engine, run, people", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["water splashes as an animal walks through", "a car speeding up in the distance"], "sample_ids": ["w1ir-sZ3Im8", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["animal, water, splashes", "distance, car, speed"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yeFvk9x0wWI", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["chirp, twitter, clatter", "a woman, something, fried"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "paper is crumpling consistently"], "sample_ids": ["wSVhSdj0F0", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["horn honks, keys jingle, slam", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person 
holding a pair of scissors"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "a stream of water flows as people talk and wind blows"], "sample_ids": ["tMJne1a4AFI", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["wind, buzz, rustling", "stream, water, flow"], "captions_pred_video": ["a swarm of bees on the ground", "footage is blurry and out of focus"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaking with light rustling", "people applaud and hoot and chat quietly"], "sample_ids": ["zOZleIRqZm4", "wwyfGO2J4"], "start_seconds": ["80", "90"], "properties": ["light, rustling, man", "people, applaud, hoot"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a woman speaks over sizzling noise"], "sample_ids": ["xKB8O8LTs6s", "yajyRTUQk3U"], "start_seconds": ["70", "400"], "properties": ["music, gunfire, explosion", "noise, woman, speak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "- a woman cooking in the kitchen"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking while food is frying in the background"], "question": "which entity has a woman speaking over a sizzling noise?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a young female speaks, followed by spraying and 
a female screaming"], "sample_ids": ["sSMl2vc3ek", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["loud, multiple, distance", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "a car speeding up in the distance"], "sample_ids": ["yYJksgsxx5U", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["audio, clicks, scraping", "distance, car, speed"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", null], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "some men converse over an engine running"], "sample_ids": ["wqADXCzngMw", "sCiy7QS1U"], "start_seconds": ["340", "300"], "properties": ["audio, humming, revving", "men, converse, engine"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", null], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video", "label": 1}, {"captions": ["water bubbles and gurgles.", "water pouring and bubbling"], "sample_ids": ["tB7hWb9gTuQ", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["bubbles, gurgles, water", "water, bubbles, pouring"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["water is splashing and gurgling", "water is running from a faucet"], "question": "which entity is a video of water flowing and bubbling?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a man speaks as a car is passing by"], "sample_ids": ["vr8ZXjEBhMQ", "sK4u5T8hW78"], "start_seconds": ["150", "30"], "properties": ["wind, blow, zoom", "a, car, pass"], "captions_pred_video": ["is taken from a motorcycle's point of view", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a zoom of", "label": 0}, {"captions": ["a motor slows to a stopover traffic noises", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zofjfKhqLk8", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["noise, stop, motor", "applause, audience, yells"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man is talking to another man on a stage 
in front of a microphone"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tGcFnX0GHI", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["ring, talk, woman", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "small dogs yip and bark sharply"], "sample_ids": ["y8WEcpOlT3I", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["wind, speak, buffeting", "bark, yip, sharply"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vXlk0lIQBFo", "yajyRTUQk3U"], "start_seconds": ["470", "400"], "properties": ["wind, speak, vocalize", "a woman, something, fried"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "- a woman cooking in the kitchen"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tOSWIURC-4", 
"xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["engine, work, nearby", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a lawn mower is running ", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["wind blowing followed by a zoom", "an infant crying as a woman laughs"], "sample_ids": ["vr8ZXjEBhMQ", "xhmRY9yhC7c"], "start_seconds": ["150", "20"], "properties": ["wind, blow, zoom", "a, laugh, infant"], "captions_pred_video": ["is taken from a motorcycle's point of view", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["wfHeoPDLMaM", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["quacking, squawking, ducks", "sheep, baa, birds"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of a goat in 
a pen behind a wooden fence"], "captions_pred_audio": ["ducks are quacking", "a goat bleats and birds chirp"], "question": "which entity is a single animal", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "wind blows as people chatter quietly"], "sample_ids": ["sU53zg9Jp7s", "xBxDz0CFVn0"], "start_seconds": ["380", "30"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "wind, chatter, people"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "footage is blurry and out of focus"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "wind blowing followed by a zoom"], "sample_ids": ["vbpKkWvfOu4", "vr8ZXjEBhMQ"], "start_seconds": ["560", "150"], "properties": ["a, man, speaks", "wind, blow, zoom"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["birds chirp then an animal grunts", "people cheer as a vehicle engine revs"], "sample_ids": ["tDlysoZiA1I", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["animal, grunt, chirp", "engine revs, vehicle, people"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds are chirping and a rooster is 
crowing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["x5cuQjOdM3E", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["cat, talk, meow", "engine, laugh, loud"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a cat meows and a woman speaks", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a person snoring", "wind blows as people chatter quietly"], "sample_ids": ["t8tv5YRMJUg", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["a person, snore, loud", "wind, chatter, people"], "captions_pred_video": ["of a man getting his face licked by another man", "footage is blurry and out of focus"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "motors rev and run loudly as a person laughs"], "sample_ids": ["vJ7JPEFhyLA", "zl9Dqx-j7q4"], "start_seconds": ["16", "6"], "properties": ["three men, wind, flow", "motors rev, laugh, loudly"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars "], "question": "which entity is more quiet", "label": 0}, {"captions": ["some clanking with distant murmuring", "motors rev and run loudly as a person laughs"], "sample_ids": ["uMTTDZ2mb4", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["clanking, murmuring, distant", "motors rev, laugh, loudly"], 
"captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a jet engine roars "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a girl talking, laughing and sneezing noise", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y4tPJXBKDig", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["a, noise, talk", "music, gunfire, explosion"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a person speaks briefly"], "sample_ids": ["zgUgkpk78xU", "zOZleIRqZm4"], "start_seconds": ["70", "80"], "properties": ["horn, bell, train", "person, talk, brief"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking with crickets chirping in the background"], "question": "which entity is talking", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["vJvryTwuAV8", "vddP56-ogds"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "liquid, laughs, man"], 
"captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", null], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "water is running and gurgling and a man is speaking"], "question": "which entity has a man speaking to an audience?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yRx9txMcBl0", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "multiple, people, yell"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "small dogs yip and bark sharply"], "sample_ids": ["rqu8iB22IY", "v-wcQf4BDY0"], "start_seconds": ["5", "120"], "properties": ["sound, repeats, laugh", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a dog barks and growls"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a woman talks 
while something is fried and objects are tapped"], "sample_ids": ["vbr9mHKc8WM", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["noise, loudness, engine", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["an engine is idling", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["an infant crying frantically", "a man speaks as a motor runs in the background"], "sample_ids": ["zwOBqeFTgiU", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["cry, infant, frantically", "background, motor, run"], "captions_pred_video": ["of the baby crying in the car seat", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a baby cries loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is silent", "label": 1}, {"captions": ["ticking continues without interruption", "a man speaks as a motor runs in the background"], "sample_ids": ["v-g-j2uTByM", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "background, motor, run"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a clock is ticking loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a clock", "label": 0}, {"captions": ["men speak and a nozzle sprays liquid", "water flows and trickles"], "sample_ids": ["wRV8yMk886E", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "water, flow, trickle"], "captions_pred_video": ["two cars are parked in a parking lot at night", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man speaks followed by a loud burst", 
"water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds vocalize and a man speaks", "an insect buzzes around continuously"], "sample_ids": ["v0wPrLBI3hg", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["vocalize, bird, speak", "buzzes, continuously, insect"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a fly is buzzing around a microphone "], "question": "which entity is not a bird?", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xERFUeZONz8", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "a woman, a television program, a bird"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["an emergency vehicle siren blares", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "waves crash against a shoreline and wind blows"], "sample_ids": ["vJvryTwuAV8", "zdYdyF9-m8U"], "start_seconds": ["16", "7"], "properties": ["audience, cheer, man", "wind, crash, shoreline"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a person kayaking in the ocean near a cliff"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "waves crash and wind blows "], "question": "which entity is more active", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "people applaud and hoot and chat quietly"], "sample_ids": ["sQGXqGcwOTc", 
"wwyfGO2J4"], "start_seconds": ["3", "90"], "properties": ["cling, speak, dishes", "people, applaud, hoot"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "wind blowing followed by a zoom"], "sample_ids": ["wqN6IIHw3po", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["rain, surface, fall", "wind, blow, zoom"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and water is splashing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["material crumbles into a microphone", "water flows as men speak and yell"], "sample_ids": ["vofpvUo6NAw", "vJ7JPEFhyLA"], "start_seconds": ["220", "16"], "properties": ["material, crumbles, microphone", "water, flow, men"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "multiple people speak and children yell while water gurgles"], "sample_ids": ["ylpYOorfH4o", "vb1fPSDI4c"], "start_seconds": ["410", "30"], "properties": ["motor, run, steady", "multiple, people, yell"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube 
how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vcmWSmvti8", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["music, man, fire", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about to fly", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wqZ135Ssz0", "xBxDz0CFVn0"], "start_seconds": ["60", "30"], "properties": ["man, woman, squawks", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is accompanied by a man and woman speaking", "label": 0}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "someone whistles a tune"], "sample_ids": 
["wyllXV6PjKo", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["a baby, a woman, a man", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["paper is crumpling consistently", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["v5cSxLaHADY", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "a woman, a television program, a bird"], "captions_pred_video": ["footage of the person holding a pair of scissors", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["male speech with light ticking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xO-Q2BlIIPU", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["male, speech, ticking", "applause, audience, yells"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a telephone rings followed by a woman talking"], "sample_ids": ["vb1fPSDI4c", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["multiple, people, yell", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a crowd of people are talking and laughing", "a dial tone sounds followed by a woman speaking"], "question": "which 
entity is a conversation", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "winds blows roughly as a vehicle races past"], "sample_ids": ["ujMt0-D-x2k", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["snoring, rhythmical, nearby", "wind, blows, vehicle"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a person is snoring loudly", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a man speaks as a car is passing by"], "sample_ids": ["s4Uz1Ffgo04", "sK4u5T8hW78"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "a, car, pass"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is quieter", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tIY7qOV3rEM", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "wind, blow, vehicle"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat 
is meowing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["dishes cling together then a man begins to speak", "an animal growls followed by birds chirping"], "sample_ids": ["sQGXqGcwOTc", "y2ZBGpgbhHM"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "animal, growl, bird"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "birds chirping and a dog panting"], "question": "which entity is more likely to be a scream", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xBxDz0CFVn0", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["stream, water, flow", "a woman, laughs, animal"], "captions_pred_video": ["footage is blurry and out of focus", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "a man speaks as a motor runs in the background"], "sample_ids": ["vK93VuO0yNc", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["male voice, bus, rumble", "background, motor, run"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a child speaks in closed space"], "sample_ids": ["v-wcQf4BDY0", 
"yW6FWLSLkx4"], "start_seconds": ["120", "40"], "properties": ["bark, yip, sharply", "child, space, speak"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a dog barks and growls", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tGcFnX0GHI", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["ring, talk, woman", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sZPuqDgX2V0", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["engine, accelerate, intercom", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird nearby?", "label": 1}, {"captions": ["an insect buzzes around continuously", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["v25l1jef3JY", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["buzzes, continuously, insect", "a, scream, girl"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a fly is buzzing around 
a microphone ", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["slZLHwNbbt4", "ziUT9IFTkjg"], "start_seconds": ["300", "10"], "properties": ["clap, distance, horn", "background, birds, rustling"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an animal quacks rapidly", "a car speeding up in the distance"], "sample_ids": ["vh30P49Po6s", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["animal, quacks, rapidly", "distance, car, speed"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "paper folding and crinkling"], "sample_ids": ["sOa7g-44Dag", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["background, man, spray", "paper, fold, crinkle"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["a child speaks in closed space", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yW6FWLSLkx4", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["child, space, speak", "stream, water, flow"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "wind blowing and birds chirping with the distant cooing of a large bird"], "sample_ids": ["w5W5Kqtc8E", "wRBHTgrbiwg"], "start_seconds": ["100", "50"], "properties": ["wind, blow, vehicle", "birds, chirp, cooing"], "captions_pred_video": [null, "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "birds are chirping and insects are buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a toilet flushes and water drains", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sfAvvZwdLCY", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["water drains, flushes, water", "men, 
talk, cars"], "captions_pred_video": ["footage of the toilet in the bathroom", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a source of water", "label": 0}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uZesmtKZGSw", "zFjIWfSD-4"], "start_seconds": ["250", "410"], "properties": ["men, talk, cars", "People, motor, brakes"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more cars", "label": 0}, {"captions": ["a man speaks as bees buzz and birds chirp", "a stream of water flows as people talk and wind 
blows"], "sample_ids": ["t25U-v4k4ts", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["bees buzz, birds chirp, man speaks", "stream, water, flow"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a male speaks over some small clicks", "birds chirp and objects are moved around"], "sample_ids": ["uXxVebHsGZ8", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["male, clicks, speak", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "children speak and play together"], "sample_ids": ["se87d6yxEOA", "yVVP8XvWJTo"], "start_seconds": ["10", "260"], "properties": ["run, whistle, pass", "children, speak, play"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "children are speaking and breathing with background noise "], "question": "which entity is moving", "label": 0}, {"captions": ["an aircraft engine runs as wind blows heavily", "a duck quacks continuously"], "sample_ids": ["xjvTpk2Zpr8", "vh30P49Po6s"], "start_seconds": ["70", "30"], "properties": ["engine, run, wind", "quacks, continuously, duck"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": 
["a jet engine roars and wind blows ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a duck quacks continuously"], "sample_ids": ["tK4VlLsNxak", "vh30P49Po6s"], "start_seconds": ["120", "30"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "quacks, continuously, duck"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a beep occurs briefly", "someone is typing on a computer keyboard"], "sample_ids": ["xtWeJ56-U-g", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["beep, occur, briefly", "keyboard, type, computer"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "how to make money on youtube in spanish"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a person is typing on a keyboard"], "question": "which is not a type of computer", "label": 0}, {"captions": ["loud clanking and banging with brief male speech", "paper is crumpling consistently"], "sample_ids": ["sWZzXuWYY", "v5cSxLaHADY"], "start_seconds": ["420", "0"], "properties": ["male, speech, banging", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": 
["birds vocalize and chirp continuously", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w1mlz3Pe4fU", "wz7N8YRy74I"], "start_seconds": ["300", "30"], "properties": ["vocalize, chirp, continuously", "rooster, crow, background, men"], "captions_pred_video": ["of a bird in a cage", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a rooster?", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a woman talks while a baby cries and a man whispers"], "sample_ids": ["y1saVTXsKwc", "smDKStoHBJo"], "start_seconds": ["80", "0"], "properties": ["a, dog, talk", "a, talk, baby, cry"], "captions_pred_video": ["a dog playing with a pink ball", "a man holding a crying baby in his arms"], "captions_pred_audio": ["a dog barks and a man speaks", "a baby is crying and a woman is speaking"], "question": "which entity has a dog?", "label": 0}, {"captions": ["a person speaks over rustling leaves", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zOZleIRqZm4", "yajyRTUQk3U"], "start_seconds": ["80", "400"], "properties": ["rustling, leaves, person", "a woman, something, fried"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a muffled toilet flushes and the water drains"], "sample_ids": ["vzceMbklWc", "sfAvvZwdLCY"], "start_seconds": ["180", "20"], "properties": ["water, faucet, sink", "flushes, drains, water"], "captions_pred_video": [null, "footage of the 
toilet in the bathroom"], "captions_pred_audio": ["water is running and a man is speaking", "a toilet is flushed"], "question": "which entity has water running in it", "label": 0}, {"captions": ["food is frying while a woman speaks", "a woman speaks happily and an animal chirps"], "sample_ids": ["yhQ2Lg-7qDY", "uWAAAL4CIoc"], "start_seconds": ["130", "0"], "properties": ["food, woman, speak", "a woman, chirps, animal"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and a dog is barking "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a infant makes noise and is excited", "a loud engine muffles a man as he speaks"], "sample_ids": ["wIJK3-5y0kA", "xyx6eNVEYRY"], "start_seconds": ["30", "380"], "properties": ["noise, excited, infant", "loud, engine, muffles"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a baby cries and a woman speaks", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "an infant crying as a woman laughs"], "sample_ids": ["y2ZBGpgbhHM", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["animal, growl, bird", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds chirping and a dog panting", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "an engine runs loudly"], "sample_ids": ["vVhthZ45k3Y", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["cat, purr, hiss", "loud, engine, run"], "captions_pred_video": ["footage is blurry and out of focus", "footage is 
blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tDlysoZiA1I", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["animal, grunt, multiple", "engine, laugh, loud"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a car speeding up in the distance"], "sample_ids": ["su6FAOcOA8c", "u0TrcHhkPQ"], "start_seconds": ["4", "20"], "properties": ["engine, run, woman", "distance, car, speed"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a race car accelerates and revs its engine "], "question": "which car is speeding up in the distance", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yaln9y8I7ms", "uEU-Hg5MTN8"], "start_seconds": ["230", "27"], "properties": ["female, flushes, toilet", "a woman, laughs, animal"], "captions_pred_video": ["footage is blurry and out of focus", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["an audience gives applause", "a telephone rings followed by a woman talking"], "sample_ids": ["x6iCUDmRpKQ", "tGcFnX0GHI"], "start_seconds": ["38", "0"], "properties": 
["applause, audience, give", "ring, talk, woman"], "captions_pred_video": ["a black background with the moon and stars in the sky", null], "captions_pred_audio": ["a group of people are clapping and cheering", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a dog barks and whimpers"], "sample_ids": ["ukxt9I7eMMg", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["continuous, woman, speaking", "barks, whimpers, dog"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "the puppies are playing with a toy"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a dog is barking and growling"], "question": "which entity is a dog?", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["skd2PphS6oI", "uZesmtKZGSw"], "start_seconds": ["190", "250"], "properties": ["ring, bird, vocalize", "men, talk, cars"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": 
["a person burps loudly for a long time nearby", "a propeller rotates loudly and intensely"], "sample_ids": ["vf44CgrjT0A", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["loud, long, person", "loud, intense, propeller"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a loud burp", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a dog barks and whimpers", "a piece of wood is being placed down and sawed"], "sample_ids": ["sShpyu2l4YQ", "uiItxDsDMFI"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "wood, piece, saw"], "captions_pred_video": ["the puppies are playing with a toy", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a dog is barking and growling", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["some men converse over an engine running", "water pouring and bubbling"], "sample_ids": ["sCiy7QS1U", "uyRfq-jKPpo"], "start_seconds": ["300", "50"], "properties": ["men, converse, engine", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an 
afro pu"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "water is running from a faucet"], "question": "which entity is more likely to be in a kitchen", "label": 1}, {"captions": ["a man talks while vehicles pass by", "a man speaks with another voice speaking in the background"], "sample_ids": ["sK4u5T8hW78", "u21-Z5gJCB8"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "background, voice, man"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man talking while vehicles pass by?", "label": 0}, {"captions": ["wind blows strongly", "an infant crying as a woman laughs"], "sample_ids": ["w8uLijTqtlU", "xhmRY9yhC7c"], "start_seconds": ["70", "20"], "properties": ["wind, blows, strongly", "a, laugh, infant"], "captions_pred_video": ["footage is blurry and shaky", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["the wind is blowing strongly", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yLy-WycbVVE", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["background, people, talk", "two men, woman, birds"], "captions_pred_video": ["a soccer field in a stadium with 
yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a more natural background", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "vehicles pass by on a roadway"], "sample_ids": ["yDoT73BWsdA", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["engine revs, tires squeal, vehicle", "pass, vehicle, roadway"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a woman speaks happily and an animal chirps"], "sample_ids": ["uiS58TNyUiw", "uWAAAL4CIoc"], "start_seconds": ["430", "0"], "properties": ["audio, man, speaking", "a woman, chirps, animal"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a woman is speaking and a dog is barking "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["speaking following by laughing and clapping", "people speak as gunfire rings out"], "sample_ids": ["u2f5NpsoHBg", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["person, laugh, clap", "gunfire, ring, speak"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is speaking and a gun is fired"], "question": "which entity shows a person speaking and then laughing and clapping?", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "a series of light 
horn beeps is followed by a loud steam whistle"], "sample_ids": ["yRx9txMcBl0", "wnpJndXuxLc"], "start_seconds": ["40", "50"], "properties": ["accelerates, tires, squeals", "beeps, loud, whistle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a door opens and birds chirp", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yeFvk9x0wWI", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["door, open, birds", "a, scream, girl"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vYkA3cfXp5Q", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["engine, accelerate, idle", "two men, woman, birds"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", null], "captions_pred_audio": ["an 
engine is idling", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a video of a vehicle engine accelerating then running on idle?", "label": 0}, {"captions": ["someone is typing on a computer keyboard", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["v0x1odnXtP0", "w5W5Kqtc8E"], "start_seconds": ["210", "100"], "properties": ["keyboard, type, computer", "wind, blow, vehicle"], "captions_pred_video": ["how to make money on youtube in spanish", null], "captions_pred_audio": ["a person is typing on a keyboard", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a helicopter engine idles continuously", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["ugHJF0hfYkg", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["engine, idle, continuously", "a woman, something, fried"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a man speaks as a car is passing by"], "sample_ids": ["soTOh3zYJfY", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["vehicle, skid, tires", "a, car, pass"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which vehicle is skidding and squealing tires", "label": 0}, {"captions": ["a baby cries and a woman moans", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["smDKStoHBJo", "yks4cLgIDMc"], "start_seconds": ["0", "170"], "properties": ["a, cry, woman", "background, speaking, child"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["a child babbles as a woman speaks", "an infant crying as a woman laughs"], "sample_ids": ["wEBlkGWVWwE", "xhmRY9yhC7c"], "start_seconds": ["260", "20"], "properties": ["a, babble, woman", "a, laugh, infant"], "captions_pred_video": ["shows a person writing on the whiteboard", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a baby cries and a woman speaks"], "question": "which entity is a child", "label": 0}, {"captions": ["a person whistles and clicks a mouse", "vehicle tires screech and a man speaks before a car door opens"], "sample_ids": ["zCrAfDfv6-A", "sxYkFKFIZD0"], "start_seconds": ["30", "20"], "properties": ["person, mouse, click", "screech, man, door"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 
audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2"], "captions_pred_audio": ["a person whistles a song", "a man is speaking while a car is revving and accelerating with a squeal in the background "], "question": "which entity is about a car door opening?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "an airplane engine runs"], "sample_ids": ["ugHJF0hfYkg", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "engine, airplane, runs"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sLUnaPT5gM8", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["loud, laughter, intermittent", "clickety-clack, train, whistle"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a train blows its whistle and blows its horn "], "question": "which entity is continuous", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["xfaoyyzw2WU", "uEU-Hg5MTN8"], "start_seconds": ["180", "27"], "properties": ["loud, jet engine, roar", "animal, grunts, snorts"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a woman is speaking and a baby is crying"], 
"question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "wind blowing followed by a zoom"], "sample_ids": ["zALy31PjDl0", "vr8ZXjEBhMQ"], "start_seconds": ["21", "150"], "properties": ["a man, a vehicle, a horn", "wind, blow, zoom"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is about a man blowing a vehicle horn?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "small dogs yip and bark sharply"], "sample_ids": ["x6ijhqRY38s", "v-wcQf4BDY0"], "start_seconds": ["250", "120"], "properties": ["something metal, glass, hit", "bark, yip, sharply"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["w5W5Kqtc8E", "uZesmtKZGSw"], "start_seconds": ["100", "250"], "properties": ["wind, engine, scream", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["a stream runs then someone speaks", "a man speaks as a car is passing by"], "sample_ids": ["wbHTKEJZyhc", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["stream, run, someone", "a, car, pass"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 
6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to a car passing by?", "label": 1}, {"captions": ["a stream runs then someone speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wbHTKEJZyhc", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["stream, run, someone", "music, gunfire, explosion"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, 
{"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "people applaud and hoot and chat quietly"], "sample_ids": ["wsHBIgzs9Fs", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["horn, continuous, buzzing", "people, applaud, hoot"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", null], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a mechanical buzzing getting louder"], "sample_ids": ["vbZ-0lGPneg", "sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["a woman, a television program, a bird", "noise, loud, buzzing"], "captions_pred_video": ["of a man holding a baby duck in his hands", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "some tunes played by whistling"], "sample_ids": ["uWAAAL4CIoc", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["a woman, chirps, animal", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a woman speaks with water running"], "sample_ids": 
["sShpyu2l4YQ", "wTideSjRFS0"], "start_seconds": ["0", "30"], "properties": ["growl, bark, yip", "water, running, woman"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking while water is running in the background"], "question": "which entity is more calm", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a dog barks and whimpers"], "sample_ids": ["xM4joTqDVp4", "sShpyu2l4YQ"], "start_seconds": ["160", "0"], "properties": ["background, chirp, birds", "barks, whimpers, dog"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "the puppies are playing with a toy"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["an engine runs loudly", "a horn rings out as a machine runs by"], "sample_ids": ["vqZuVbG6-HI", "slZLHwNbbt4"], "start_seconds": ["130", "300"], "properties": ["loud, engine, run", "a, horn, run"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a car speeding up in the distance", "a woman speaks with water running"], "sample_ids": ["u0TrcHhkPQ", "wTideSjRFS0"], "start_seconds": ["20", "30"], "properties": ["distance, car, speed", "water, running, woman"], "captions_pred_video": [null, "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking while water is running in 
the background"], "question": "which entity is moving at a slower speed", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "wind blows as people chatter quietly"], "sample_ids": ["y2ZBGpgbhHM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["birds, tweet, pant", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a duck quacks continuously"], "sample_ids": ["vzceMbklWc", "vh30P49Po6s"], "start_seconds": ["180", "30"], "properties": ["water, faucet, sink", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["water is running and a man is speaking", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "people speak as gunfire rings out"], "sample_ids": ["smDKStoHBJo", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["a, talk, baby, cry", "gunfire, ring, speak"], "captions_pred_video": ["a man holding a crying baby in his arms", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "an infant crying as a woman laughs"], "sample_ids": ["sG7TyPnFDR0", "xhmRY9yhC7c"], "start_seconds": ["180", "20"], "properties": ["beeps, machine, smoke alarm", "a, laugh, infant"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "of a baby crying in a 
baby bouncer"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["speaking following by laughing and clapping", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["u2f5NpsoHBg", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "a woman, a television program, a bird"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird nearby?", "label": 1}, {"captions": ["goats bleat and people speak", "people speak as gunfire rings out"], "sample_ids": ["z5iUE5h0EPs", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["goats bleat, people speak, language", "gunfire, ring, speak"], "captions_pred_video": ["of the goat in the barn", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a goat bleats and a man speaks", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a man is filing a hard object"], "sample_ids": ["vzxHnu-SFEw", "vveS8HT7Uog"], "start_seconds": ["80", "100"], "properties": ["two objects, woman, speak", "a man, hard, object"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage is of a workbench with various tools on it including a hammer and a screwdriver"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is filing and speaking with background noise and breathing "], "question": "which object is harder to file", "label": 0}, {"captions": ["a man speaks on a radio as wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tDVADusiIoc", "yajyRTUQk3U"], "start_seconds": ["60", "400"], "properties": ["man, radio, blows", "a woman, something, fried"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xSKJGCItUWE", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["engine, work, child", "music, gunfire, explosion"], "captions_pred_video": ["footage of the helicopter flying in the room", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more 
gunfire", "label": 1}, {"captions": ["a kid speaks followed by music playing", "an infant crying as a woman laughs"], "sample_ids": ["tQWGZLItBXk", "xhmRY9yhC7c"], "start_seconds": ["170", "20"], "properties": ["music, kid, speak", "a, laugh, infant"], "captions_pred_video": ["worms revolution screenshots", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tDVADusiIoc", "su6FAOcOA8c"], "start_seconds": ["60", "4"], "properties": ["man, radio, blows", "engine, idle, woman"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a motorcycle engine works nearby", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tOSWIURC-4", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["engine, work, nearby", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a stream of water runs briefly"], "sample_ids": ["vXlk0lIQBFo", "x-PeY8Yb8M4"], "start_seconds": ["470", "300"], "properties": ["wind, speak, vocalize", "stream, water, run"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["wind 
chimes are ringing and people are speaking and laughing ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "water flows and trickles"], "sample_ids": ["w-4gHptFNuU", "tB7hWb9gTuQ"], "start_seconds": ["21", "30"], "properties": ["engine revs, accelerates, bump", "water, flow, trickle"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car accelerates and revs its engine ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w-4gHptFNuU", "xKB8O8LTs6s"], "start_seconds": ["21", "70"], "properties": ["engine revs, accelerates, bump", "music, gunfire, explosion"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car accelerates and revs its engine ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["someone snores nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["spJCm8tD9Zo", "wwyfGO2J4"], "start_seconds": ["90", "90"], "properties": ["someone snores, nearby, someone", "people, applaud, hoot"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone whistles a song", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sIXTftIuUgw", "sLUnaPT5gM8"], 
"start_seconds": ["90", "0"], "properties": ["someone, song, whistle", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a person whistling a song", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks happily and an animal chirps", "small dogs yip and bark sharply"], "sample_ids": ["uWAAAL4CIoc", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["a woman, chirps, animal", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a drill runs and two people laugh", "a duck quacks continuously"], "sample_ids": ["tEE3MpBt1sg", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "quacks, continuously, duck"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "paper is crumpling consistently"], "sample_ids": ["uWAAAL4CIoc", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["a woman, chirps, animal", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a vehicle engine 
accelerating then running on idle"], "sample_ids": ["y682ml90jGw", "vYkA3cfXp5Q"], "start_seconds": ["11", "30"], "properties": ["beeps, series, electronic", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a beeping sound is being made ", "an engine is idling"], "question": "which entity is a series of beeps", "label": 0}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "three men talk while wind blows and some liquid flows"], "sample_ids": ["ylpYOorfH4o", "vJ7JPEFhyLA"], "start_seconds": ["410", "16"], "properties": ["motor, run, steady", "three men, wind, flow"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "water flows and trickles"], "sample_ids": ["vbZ-0lGPneg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["a woman, a television program, a bird", "water, flow, trickle"], "captions_pred_video": ["of a man holding a baby duck in his hands", "the rocks on the beach are 
surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "wind blowing followed by a zoom"], "sample_ids": ["w2JXXIAdUdg", "vr8ZXjEBhMQ"], "start_seconds": ["10", "150"], "properties": ["emits, sleeping, person", "wind, blow, zoom"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a person snoring and a dog whimpering", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "water splashes as an animal walks through"], "sample_ids": ["uOpoD0gGXcs", "w1ir-sZ3Im8"], "start_seconds": ["120", "90"], "properties": ["chirps, woman, bird", "animal, water, splashes"], "captions_pred_video": ["a herd of cows grazing in the field", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds are chirping and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["wyllXV6PjKo", "tIY7qOV3rEM"], "start_seconds": ["30", "0"], "properties": ["a kid, talk, cry", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": [null, "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a woman speaks and a baby cries", "a dog is barking and a cat is meowing"], "question": "which entity has more animals", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "a car revs and accelerates loudly and men and women chatter among 
themselves"], "sample_ids": ["wPz6QRAkEb4", "y8dSeubCNI"], "start_seconds": ["60", "4"], "properties": ["chirps, tweets, song", "men, women, car"], "captions_pred_video": ["a bird in a cage on top of a pole", null], "captions_pred_audio": ["birds are chirping in the background ", "an engine revving and people talking in the background"], "question": "which entity is more quiet", "label": 0}, {"captions": ["race cars go around a track as a man commentates", "pigeons vocalize and birds chirp"], "sample_ids": ["uZesmtKZGSw", "uiS58TNyUiw"], "start_seconds": ["250", "430"], "properties": ["car, track, man", "vocalize, bird, chirp"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["ul60S8TXDA8", "t25U-v4k4ts"], "start_seconds": ["60", "40"], "properties": ["sound, distance, bell", "a, chirps, bird"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a man is speaking and bees are buzzing"], "question": "which entity is a 
bird?", "label": 1}, {"captions": ["a kid speaks followed by music playing", "water flows and trickles"], "sample_ids": ["tQWGZLItBXk", "tB7hWb9gTuQ"], "start_seconds": ["170", "30"], "properties": ["music, kid, speak", "water, flow, trickle"], "captions_pred_video": ["worms revolution screenshots", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a beep repeats multiple times", "a man speaks followed by another man speaking outside"], "sample_ids": ["y682ml90jGw", "viuTg1M-dqg"], "start_seconds": ["11", "30"], "properties": ["beep, repeat, multiple", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a beeping sound is being made ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a single speaker?", "label": 0}, {"captions": ["a man speaks as music plays before artillery is fired", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vcmWSmvti8", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["music, man, fire", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp as a train approaches", "water pouring and bubbling"], "sample_ids": ["xM4joTqDVp4", "uyRfq-jKPpo"], "start_seconds": ["160", "50"], "properties": ["bird, chirp, train", "water, bubbles, pouring"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's 
smokestack", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds are chirping and a train is moving ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["rustling with distant murmuring", "birds chirp and objects are moved around"], "sample_ids": ["wnNNcxAPwGQ", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["sound, distance, rustling", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["water pouring and bubbling", "a stream of water runs briefly"], "sample_ids": ["uyRfq-jKPpo", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["water, bubbles, pouring", "stream, water, run"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["water is running from a faucet", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a person uses a saw to cut some wood"], "sample_ids": ["yRx9txMcBl0", "sHbXC6na9hg"], "start_seconds": ["40", "0"], "properties": ["accelerates, tires, squeals", "a person, saw, wood"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a car is revving its engine and skidding ", "an engine is idling and vibrating"], "question": "which entity is stationary", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["xyL9F5VrjkE", "xfaoyyzw2WU"], "start_seconds": ["20", "180"], "properties": ["engine, run, wind", "loud, 
jet engine, roar"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["yDoT73BWsdA", "sjlVMgdGSK0"], "start_seconds": ["10", "30"], "properties": ["engine revs, tires squeal, vehicle", "accelerates, vehicle, race car"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car accelerates and revs its engine "], "question": "which vehicle is accelerating", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["weDbePuc-Xc", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["music, slaps, human", "man, woman, squawks"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a man and woman speaking?", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a child speaks in closed space"], "sample_ids": ["tIY7qOV3rEM", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "child, space, speak"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of the doll sitting on the 
bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a clock ticktocks"], "sample_ids": ["vcmWSmvti8", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["music, man, fire", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a helicopter engine runs", "birds chirp and pigeons vocalize while walking around"], "sample_ids": ["t5ZbXbniOWk", "wIvYjuR3nrg"], "start_seconds": ["30", "9"], "properties": ["engine, helicopter, run", "birds, pigeons, vocalize"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage of a pigeon sitting on a roof with trees in the background"], "captions_pred_audio": ["a helicopter is flying overhead ", "birds are chirping and cooing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an insect buzzes around continuously", "a toilet flushes and a female speaks"], "sample_ids": ["v25l1jef3JY", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["buzzes, continuously, insect", "female, flushes, toilet"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage is blurry and out of focus"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a toilet flushes and a man speaks"], "question": "which entity is not a person", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "dishes cling together then a man begins to speak"], "sample_ids": ["uPDn2BFTHk", 
"sQGXqGcwOTc"], "start_seconds": ["140", "3"], "properties": ["lady, laugh, baby", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a baby laughs and a woman speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about a baby and a lady?", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "paper is crumpling consistently"], "sample_ids": ["zF8yoL0rkbI", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["engine, run, someone", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the traffic on the street at night", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "an insect buzzes around continuously"], "sample_ids": ["zl9Dqx-j7q4", "v25l1jef3JY"], "start_seconds": ["6", "0"], "properties": ["motors rev, laugh, loudly", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a man driving a car in the dark", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a jet engine roars ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "rain falls on a surface as men speak and music plays"], "sample_ids": ["vms5XGTDVQc", "w0xsN8X18Y"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "music, surface, rain"], "captions_pred_video": ["footage of a woman opening a black bag on a table", null], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is not a video of rain falling on 
a surface?", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yaln9y8I7ms", "ukg5L09Wpvo"], "start_seconds": ["230", "150"], "properties": ["female, flushes, toilet", "clickety-clack, train, whistle"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "water splashes as an animal walks through"], "sample_ids": ["vveS8HT7Uog", "w1ir-sZ3Im8"], "start_seconds": ["100", "90"], "properties": ["a man, objects, speak", "animal, water, splashes"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["loud, continuous burping", "birds chirp and pigeons vocalize while walking around"], "sample_ids": ["y636gklDioE", "wIvYjuR3nrg"], "start_seconds": ["20", "9"], "properties": ["loud, continuous, burping", "birds, pigeons, vocalize"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage of a pigeon sitting on a roof with trees in the background"], "captions_pred_audio": ["a person burps loudly several times", "birds are chirping and cooing"], "question": "which entity is not a human", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "paper is crumpling consistently"], "sample_ids": ["s4Uz1Ffgo04", "v5cSxLaHADY"], "start_seconds": ["100", "0"], "properties": 
["water, rushes, vehicle", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a stream of water runs briefly"], "sample_ids": ["u2f5NpsoHBg", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["person, laugh, clap", "stream, water, run"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a consistent ticking pattern", "pigeons vocalize and birds chirp"], "sample_ids": ["sCeWURVHfOM", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["ticking, pattern, clock", "vocalize, bird, chirp"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "of the pigeon in the cage"], "captions_pred_audio": ["ticking of a clock", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["zY3icUyMdh8", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["dog, bark, engine", "animal, grunts, snorts"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is more 
playful", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a saw finishes running as metal clings in the background"], "sample_ids": ["v-wcQf4BDY0", "zofjfKhqLk8"], "start_seconds": ["120", "10"], "properties": ["bark, yip, sharply", "background, metal, clings"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a dog barks and growls", "a large engine is running and a bell is ringing"], "question": "which entity is a still image?", "label": 0}, {"captions": ["a dog whimpers and a woman briefly talks", "dog barking and vehicle engine idling followed shortly by vehicle engine revving"], "sample_ids": ["y1saVTXsKwc", "zY3icUyMdh8"], "start_seconds": ["80", "20"], "properties": ["a, dog, talk", "dog, bark, engine"], "captions_pred_video": ["a dog playing with a pink ball", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a dog barks and a man speaks", "a car is driving and dogs are barking and squealing "], "question": "which entity is more active", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a machine beeps continuously"], "sample_ids": ["vJvryTwuAV8", "y682ml90jGw"], "start_seconds": ["16", "11"], "properties": ["audience, cheer, man", "beeps, machine, continuously"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", null], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a duck quacks continuously"], "sample_ids": ["xZepNM9qcRA", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["background, motor, run", "quacks, continuously, duck"], "captions_pred_video": ["a close-up 
view of the motorcycle's engine and exhaust system", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "some men converse over an engine running"], "sample_ids": ["yRx9txMcBl0", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["accelerates, tires, squeals", "men, converse, engine"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["bees buzz as wind blows", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["tMJne1a4AFI", "x6ijhqRY38s"], "start_seconds": ["0", "250"], "properties": ["bees, buzz, wind", "something metal, glass, hit"], "captions_pred_video": ["a swarm of bees on the ground", "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and dishes are clanging "], "question": "which entity is not a video of something hitting something?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "some tunes 
played by whistling"], "sample_ids": ["uRExseg-0XI", "u6BnG6YZqJ4"], "start_seconds": ["210", "0"], "properties": ["woman, man, water", "tune, play, whistling"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["someone whistles briefly", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uFoga8sHpiw", "zl9Dqx-j7q4"], "start_seconds": ["90", "6"], "properties": ["sound, duration, pitch", "engine, laugh, loud"], "captions_pred_video": ["footage of a bird in a cage", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person whistles a song", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a duck quacks continuously"], "sample_ids": ["sSMl2vc3ek", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["a person, laughs, snores", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person snoring loudly", "a duck is quacking loudly"], "question": "which entity is a noise", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["yajyRTUQk3U", "y8WEcpOlT3I"], "start_seconds": ["400", "40"], "properties": ["a woman, something, fried", "harsh, wind, blows"], "captions_pred_video": ["- a woman cooking in the kitchen", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking with wind noise 
in the background "], "question": "which entity is about cooking?", "label": 0}, {"captions": ["an aircraft engine runs", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["yLCORCnd35Q", "yks4cLgIDMc"], "start_seconds": ["0", "170"], "properties": ["engine, aircraft, runs", "background, speaking, child"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yZmhM1HcsyE", "tiDFTC-5vU"], "start_seconds": ["4", "30"], "properties": ["engine, roar, water", "male, duck, laugh"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", null], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a man speaks followed by another man speaking outside"], "sample_ids": ["t69a8aRKhmc", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "two men, speak, follow"], "captions_pred_video": ["footage is blurry and out of focus", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["a horse runs while two women talk", "small dogs yip and bark sharply"], 
"sample_ids": ["sdvI1mHAsc", "v-wcQf4BDY0"], "start_seconds": ["20", "120"], "properties": ["two women, horse, run", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "an infant crying as a woman laughs"], "sample_ids": ["yaln9y8I7ms", "xhmRY9yhC7c"], "start_seconds": ["230", "20"], "properties": ["female, flushes, toilet", "a, laugh, infant"], "captions_pred_video": ["footage is blurry and out of focus", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "roadway noise occurs and a truck accelerates"], "sample_ids": ["sDSppXIlJrs", "tgbONvsP47Y"], "start_seconds": ["27", "0"], "properties": ["microphone, water, wind", "noise, truck, accelerate"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "footage of a fire truck entering a garage"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a car is driving on the road "], "question": "which noise is made by a truck", "label": 1}, {"captions": ["a person is snoring while sleeping", "water drips and bubbles as a man speaks"], "sample_ids": ["vJrjSeP17yE", "vSeGhaZt-aI"], "start_seconds": ["40", "50"], "properties": ["a person is sleeping, snoring, person", "water, bubbles, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video", "label": 1}, 
{"captions": ["birds chirps while a siren signals in the distance", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uKCSGgof8gI", "tdWhHV3X25Q"], "start_seconds": ["12", "60"], "properties": ["chirps, distance, signal", "applause, audience, yells"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a motorcycle idles loudly as wind blows"], "sample_ids": ["sYITalLZjj4", "v7jJS8aAyA"], "start_seconds": ["30", "10"], "properties": ["water, rushes, background, birds", "wind, blows, loudly"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], "captions_pred_audio": ["wind blows and birds chirp", "a motorcycle engine is idling and vibrating"], "question": "which entity is louder", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "an insect buzzes around continuously"], "sample_ids": ["v0x1odnXtP0", "v25l1jef3JY"], "start_seconds": ["210", "0"], "properties": ["keyboard, type, computer", "buzzes, continuously, insect"], "captions_pred_video": ["how to make money on youtube in spanish", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person is typing on a keyboard", "a fly is buzzing around a microphone "], "question": "which entity is not a person?", "label": 1}, {"captions": ["some men converse over an engine running", "people speak as gunfire rings out"], "sample_ids": ["sCiy7QS1U", "wqTCwqVRDlk"], "start_seconds": ["300", "80"], "properties": ["men, converse, engine", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], 
"captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["birds fly and flutter around", "wind blows as people chatter quietly"], "sample_ids": ["wGKgwOP3h30", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["fly, flutter, around", "wind, chatter, people"], "captions_pred_video": ["of the pigeons in the coop", "footage is blurry and out of focus"], "captions_pred_audio": ["pigeons coo and flap their wings", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["vdoxuJn9lTc", "vddP56-ogds"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "liquid, laughs, man"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "water is running and gurgling and a man is speaking"], "question": "which entity has a man talking?", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["y4tPJXBKDig", "ziUT9IFTkjg"], "start_seconds": ["20", "10"], "properties": ["a, noise, talk", "background, birds, rustling"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", null], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "birds are chirping and a chime is ringing "], "question": "which noise is made by a girl", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "people applaud and hoot and chat quietly"], "sample_ids": ["zkKdxzNC97Y", 
"wwyfGO2J4"], "start_seconds": ["27", "90"], "properties": ["loud, bang, noise", "people, applaud, hoot"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a vehicle accelerates and squeals tires"], "sample_ids": ["t8CV69hcvF0", "yRx9txMcBl0"], "start_seconds": ["210", "40"], "properties": ["person, sneeze, follow", "accelerates, tires, squeals"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a woman sneezes and speaks", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y8dSeubCNI", "wz7N8YRy74I"], "start_seconds": ["4", "30"], "properties": ["engine revving, people speaking, motorcycle", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man is speaking birds are chirping and a rooster is crowing "], 
"question": "which entity has a rooster?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "an insect buzzes around continuously"], "sample_ids": ["smDKStoHBJo", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["a, talk, baby, cry", "buzzes, continuously, insect"], "captions_pred_video": ["a man holding a crying baby in his arms", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a fly is buzzing around a microphone "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a person sniffs and sneezes", "a woman and man speak while food is frying"], "sample_ids": ["uRlbY6aoBU", "zk-xJGQU8-4"], "start_seconds": ["0", "130"], "properties": ["sneezes, person, sniffs", "food, man, woman"], "captions_pred_video": [null, "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity is a group of people", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a frog croaks as other frogs croak in the background"], "sample_ids": ["vbZ-0lGPneg", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["a woman, a television program, a bird", "background, frog, croak"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "people speak as gunfire rings out"], "sample_ids": ["u--KhUW8l1Y", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["horn, siren, life", "gunfire, ring, speak"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant 
at night", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["y4tPJXBKDig", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["a, noise, talk", "animal, grunts, snorts"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person talking", "label": 0}, {"captions": ["birds chirp then an animal grunts", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["tDlysoZiA1I", "vddP56-ogds"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, chirp", "liquid, laughs, man"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "water is running and gurgling and a man is speaking"], "question": "which entity is about a woman and a man?", "label": 1}, {"captions": ["birds chirp as a train approaches", "a toilet flushes and water drains"], "sample_ids": ["xM4joTqDVp4", "sfAvvZwdLCY"], "start_seconds": ["160", "20"], "properties": ["bird, chirp, train", "water drains, flushes, water"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of the toilet in the bathroom"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["someone is burping continuously", "three men talk while 
wind blows and some liquid flows"], "sample_ids": ["y636gklDioE", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["burps, burps, burps", "three men, wind, flow"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person burps loudly several times", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wy1eKjR7KC0", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["people, talk, distance", "music, gunfire, explosion"], "captions_pred_video": ["two police officers riding motorcycles down the street", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and a siren is going off", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "some tunes played by whistling"], "sample_ids": ["vb1fPSDI4c", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["multiple, people, yell", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xjvTpk2Zpr8", "w5W5Kqtc8E"], "start_seconds": ["70", "100"], "properties": ["engine, run, wind", "wind, blow, vehicle"], "captions_pred_video": ["footage of a dhl plane 
landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "pigeons vocalize and birds chirp"], "sample_ids": ["zFjIWfSD-4", "uiS58TNyUiw"], "start_seconds": ["410", "430"], "properties": ["People, motor, brakes", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a car accelerates and wind blows", "a duck quacks continuously"], "sample_ids": ["u0TrcHhkPQ", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vddP56-ogds", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["liquid, laughs, man", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["yRx9txMcBl0", "yDoT73BWsdA"], "start_seconds": ["40", "10"], "properties": ["motors, tires, screech", "engine, revs, vehicle"], "captions_pred_video": ["in 10 words or 
less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a race car accelerates and revs its engine "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "vehicles pass by on a roadway"], "sample_ids": ["sYITalLZjj4", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["stream, flow, wind", "pass, vehicle, roadway"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of a fire truck entering a garage"], "captions_pred_audio": ["wind blows and birds chirp", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "water rushes by"], "sample_ids": ["zuua6-5goWw", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["birds, chirp, quiet, man, speaks", "water, rushes, by"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a car is driving on a wet road "], "question": "which entity is moving faster", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "an adult woman speaks over chopping and silverware noises"], "sample_ids": ["vXlk0lIQBFo", "yYJksgsxx5U"], "start_seconds": ["470", "30"], "properties": ["wind, talk, vocalize", "audio, woman, silverware"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "of a woman slicing an orange on a cutting board"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman is speaking and dishes are clanging in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yajyRTUQk3U", "uYT5gxnyMWM"], "start_seconds": ["400", "50"], "properties": ["a woman, something, fried", "a, scream, girl"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking?", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wRBHTgrbiwg", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["birds, chirp, cooing", "music, gunfire, explosion"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "in your 
own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["someone is burping continuously", "a man speaks as a motor runs in the background"], "sample_ids": ["y636gklDioE", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "background, motor, run"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person burps loudly several times", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "dishes cling together then a man begins to speak"], "sample_ids": ["uZesmtKZGSw", "sQGXqGcwOTc"], "start_seconds": ["250", "3"], "properties": ["men, talk, cars", "cling, speak, dishes"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a car 
speeding up in the distance"], "sample_ids": ["tgbONvsP47Y", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["pass, vehicle, roadway", "distance, car, speed"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving faster", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "running water in a faucet with some clinks"], "sample_ids": ["vs65y4qmyBE", "zNRChLjqcU"], "start_seconds": ["340", "220"], "properties": ["wind, blows, strongly", "water, faucet, run"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "water is running from a faucet into a sink"], "question": "which entity is a liquid", "label": 1}, {"captions": ["someone whistles a tune", "a man speaks then multiple motorcycles pass by"], "sample_ids": ["sIXTftIuUgw", "zcDwZ6W7E3E"], "start_seconds": ["90", "180"], "properties": ["someone, tune, whistle", "a, man, speak"], "captions_pred_video": [null, "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["a person whistling a song", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a man speaks as a machine runs", "winds blows roughly as a vehicle races past"], "sample_ids": ["vD6lYD1l0BY", "xjvTpk2Zpr8"], "start_seconds": ["330", "70"], "properties": ["a, machine, run", "wind, blows, vehicle"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": 
["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "wind blows as people chatter quietly"], "sample_ids": ["zkKdxzNC97Y", "xBxDz0CFVn0"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "wind, chatter, people"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage is blurry and out of focus"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "water flows and trickles"], "sample_ids": ["vhJWZheqaE", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["water drains unevenly, toilet flushes, water drains", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a toilet is flushed", "water is splashing and gurgling"], "question": "which entity is a source of water", "label": 1}, {"captions": ["people clap and speak in the distance", "waves crash against a shoreline and people speak"], "sample_ids": ["wwyfGO2J4", "yFB25fqfU8I"], "start_seconds": ["90", "300"], "properties": ["clap, distance, speak", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be a video of a performance?", "label": 0}, {"captions": ["small dogs growl, bark and yip.", "small dogs yip and bark sharply"], "sample_ids": ["sShpyu2l4YQ", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["growl, bark, yip", "bark, yip, sharply"], "captions_pred_video": ["the puppies are playing with a toy", "footage is blurry and shaky, making it difficult to see what is 
happening"], "captions_pred_audio": ["a dog is barking and growling", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["t69a8aRKhmc", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["a, b, c", "men, talk, cars"], "captions_pred_video": ["footage is blurry and out of focus", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a woman and man speak while food is frying", "an airplane engine spools and people speak"], "sample_ids": ["zk-xJGQU8-4", "wTjoRj1se3U"], "start_seconds": ["130", "390"], "properties": ["food, man, woman", "airplane, engine, spool"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a jet engine is running and people are talking"], "question": "which entity is about a plane?", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "some men converse over an engine running"], 
"sample_ids": ["tDlfY3nmx1A", "sCiy7QS1U"], "start_seconds": ["160", "300"], "properties": ["applause, laugh, man", "men, converse, engine"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", null], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["an airplane engine runs", "birds chirp and pigeons vocalize while walking around"], "sample_ids": ["yVPZ2MNWpms", "wIvYjuR3nrg"], "start_seconds": ["0", "9"], "properties": ["engine, airplane, runs", "birds, pigeons, vocalize"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a pigeon sitting on a roof with trees in the background"], "captions_pred_audio": ["a car is driving by on the road ", "birds are chirping and cooing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["distant humming of an engine", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yVPZ2MNWpms", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["sound, distance, engine", "engine, laugh, loud"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a car is driving by on the road ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "water flows as men speak and yell"], "sample_ids": ["vYkA3cfXp5Q", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["speed, idle, accelerate", "water, flow, men"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["an engine is idling", "a man is speaking while the wind is blowing and water is splashing"], 
"question": "which entity is a video of a motor?", "label": 0}, {"captions": ["an airplane engine runs", "a infant makes noise and is excited"], "sample_ids": ["yVPZ2MNWpms", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "noise, excited, infant"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a car is driving by on the road ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an airplane engine spools and people speak", "water flows and trickles"], "sample_ids": ["wTjoRj1se3U", "tB7hWb9gTuQ"], "start_seconds": ["390", "30"], "properties": ["airplane, engine, spool", "water, flow, trickle"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a jet engine is running and people are talking", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["white noise and birds chirping", "vehicles pass by on a roadway"], "sample_ids": ["wRBHTgrbiwg", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["noise, white, chirping", "pass, vehicle, roadway"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a car is driving on the road "], "question": "which is a moving object", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "people cheer as a vehicle engine revs"], "sample_ids": ["xfaoyyzw2WU", "xjhAnI2q6hM"], "start_seconds": ["180", "6"], "properties": ["loud, jet engine, roar", "engine revs, vehicle, people"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "a 
school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a truck is revving its engine and a man is speaking "], "question": "which is louder", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a car speeding up in the distance"], "sample_ids": ["uOpoD0gGXcs", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["chirps, woman, bird", "distance, car, speed"], "captions_pred_video": ["a herd of cows grazing in the field", null], "captions_pred_audio": ["birds are chirping and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "paper folding and crinkling"], "sample_ids": ["vveS8HT7Uog", "zPpG3RD8lSs"], "start_seconds": ["100", "20"], "properties": ["a man, objects, speak", "paper, fold, crinkle"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is filing and speaking with background 
noise and breathing ", "the wind blows and a mouse clicks "], "question": "which object is being folded and crinkled", "label": 0}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "small dogs yip and bark sharply"], "sample_ids": ["yYEVLuqEytU", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["animal, pig, background", "bark, yip, sharply"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a stream of water runs briefly"], "sample_ids": ["y682ml90jGw", "x-PeY8Yb8M4"], "start_seconds": ["11", "300"], "properties": ["beeps, series, electronic", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a beeping sound is being made ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vqZuVbG6-HI", "wqZ135Ssz0"], "start_seconds": ["130", "60"], "properties": ["background, male, female", "two men, woman, birds"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a mechanical buzzing getting louder", "a duck quacks loudly and continuously"], "sample_ids": ["sEprKHm8Sj8", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": 
["noise, loud, buzzing", "loud, continuous, quacks"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a child speaks in closed space"], "sample_ids": ["u21-Z5gJCB8", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["background, voice, man", "child, space, speak"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "water is sprayed across a hard surface"], "sample_ids": ["w5W5Kqtc8E", "sQwlkXjQabo"], "start_seconds": ["100", "10"], "properties": ["wind, blow, vehicle", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "winds blows roughly as a vehicle races past"], "sample_ids": ["sxYkFKFIZD0", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["screech, man, door", "wind, blows, vehicle"], "captions_pred_video": ["2005 audi a4 
1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "an insect buzzes around continuously"], "sample_ids": ["vveS8HT7Uog", "v25l1jef3JY"], "start_seconds": ["100", "0"], "properties": ["a man, objects, speak", "buzzes, continuously, insect"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "male speech with light ticking"], "sample_ids": ["vKrYfzleLB8", "xO-Q2BlIIPU"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "male, speech, ticking"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "a clock with a green glowing display showing the time 09 07 2016 12 31 2016"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "small dogs growl, bark and yip."], "sample_ids": ["zofjfKhqLk8", 
"sShpyu2l4YQ"], "start_seconds": ["10", "0"], "properties": ["background, metal, clings", "growl, bark, yip"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "the puppies are playing with a toy"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a stream of water runs briefly"], "sample_ids": ["vJvryTwuAV8", "x-PeY8Yb8M4"], "start_seconds": ["16", "300"], "properties": ["audience, cheer, man", "stream, water, run"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["un9VQlzgZM", "zj2R0XoFr5k"], "start_seconds": ["5", "50"], "properties": ["wind, speak, laugh", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about flying", "label": 1}, {"captions": ["someone snores nearby", "some men converse over an engine running"], "sample_ids": ["spJCm8tD9Zo", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["someone snores, nearby, someone", "men, converse, engine"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a man 
speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "two women and a man talk while a kid cries"], "sample_ids": ["u6jIvCtKarQ", "wyllXV6PjKo"], "start_seconds": ["70", "30"], "properties": ["a, man, speaks", "a kid, talk, cry"], "captions_pred_video": ["footage of a person using a blender on a stove top", null], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a woman speaks and a baby cries"], "question": "which entity has a kid crying?", "label": 1}, {"captions": ["a clock ticktocks in wind", "a heavy rain falls endlessly"], "sample_ids": ["yVumC9TGknc", "wP8ZKrlx3oA"], "start_seconds": ["30", "40"], "properties": ["ticktocks, clock, wind", "heavy, rain, fall"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a series of beeps and chirps", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["water rushes and then a vehicle zooms past", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["s4Uz1Ffgo04", "ziUT9IFTkjg"], "start_seconds": ["100", "10"], "properties": ["water, rushes, vehicle", "background, birds, rustling"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["motors runs briefly and tires screech", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["yRx9txMcBl0", "xKB8O8LTs6s"], "start_seconds": ["40", "70"], "properties": ["motors, tires, 
screech", "music, gunfire, explosion"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car is revving its engine and skidding ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a man is filing a hard object", "water flows and trickles"], "sample_ids": ["vveS8HT7Uog", "tB7hWb9gTuQ"], "start_seconds": ["100", "30"], "properties": ["a man, hard, object", "water, flow, trickle"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "water is splashing and gurgling"], "question": "which entity is flowing", "label": 1}, {"captions": ["an engine runs and a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["yT5WfYMRr-U", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "a, car, pass"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a toilet flushes and water drains", "running water in a faucet with some clinks"], "sample_ids": ["sfAvvZwdLCY", "zNRChLjqcU"], "start_seconds": ["20", "220"], "properties": ["water drains, flushes, water", "water, faucet, run"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "water is running from a faucet into a sink"], "question": "which entity has water running through it?", "label": 1}, {"captions": ["bees buzz as wind blows", "a toilet flushes and a female speaks"], "sample_ids": ["tMJne1a4AFI", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["bees, buzz, wind", "female, flushes, toilet"], "captions_pred_video": ["a swarm of bees on the ground", "footage is blurry and out of focus"], "captions_pred_audio": ["a swarm of bees buzzing around", "a toilet flushes and a man speaks"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a clock ticktocks briefly", "a stream of water runs briefly"], "sample_ids": ["u7C-AEBQM", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["ticktocks, clock, ticktocks briefly", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a ticktock of a clock", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "an insect buzzes around continuously"], "sample_ids": ["y2bVZ7rz-5M", 
"v25l1jef3JY"], "start_seconds": ["280", "0"], "properties": ["engine, horn, siren", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a fly is buzzing around a microphone "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a person sniffles and sneezes", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uRlbY6aoBU", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["sneezes, sniffles, person", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a person", "label": 0}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a stream of water runs briefly"], "sample_ids": ["sapQIQUhFc", "x-PeY8Yb8M4"], "start_seconds": ["280", "300"], "properties": ["liquid, flow, distance", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tQWGZLItBXk", "tdWhHV3X25Q"], "start_seconds": ["170", "60"], "properties": ["voice, music, whoosh", "applause, audience, yells"], "captions_pred_video": ["worms revolution screenshots", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a child speaks music plays video game 
sounds sound effects and sound effects play ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zkKdxzNC97Y", "xKB8O8LTs6s"], "start_seconds": ["27", "70"], "properties": ["hard, surface, door", "music, gunfire, explosion"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a door is opened and closed", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["yaln9y8I7ms", "xjhAnI2q6hM"], "start_seconds": ["230", "6"], "properties": ["female, flushes, toilet", "engine revs, vehicle, people"], "captions_pred_video": ["footage is blurry and out of focus", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity has a vehicle?", "label": 1}, {"captions": ["a motorcycle engine is idling", "winds blows roughly as a vehicle races past"], "sample_ids": ["vZAqdHZ81yA", "xjvTpk2Zpr8"], "start_seconds": ["180", "70"], "properties": ["engine, motorcycle, idling", "wind, blows, vehicle"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["an engine is idling loudly", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a dark barks and whimpers"], "sample_ids": ["sncRqQ67iJU", "sYj4hpDUZDQ"], 
"start_seconds": ["460", "30"], "properties": ["loud, repeatedly, man", "barks, whimpers, dark"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a brown and white dog standing in front of a wall with its mouth open"], "captions_pred_audio": ["a person is snoring", "a dog barks and a cat meows"], "question": "which entity is a dog", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a person snores loudly multiple times at a close distance"], "sample_ids": ["u7C-AEBQM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["ticks, rhythmic, quiet", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as horns blow", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["tHyNqRyK34A", "sLUnaPT5gM8"], "start_seconds": ["24", "0"], "properties": ["a, man, speaks", "loud, laughter, intermittent"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "after a few seconds of silence, a loud bang occurs followed by a softer banging noise"], "sample_ids": ["zgUgkpk78xU", "zkKdxzNC97Y"], "start_seconds": ["70", "27"], "properties": ["clinking, humming, horn", "loud, bang, noise"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 
60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a door is opened and closed"], "question": "which entity is softer", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a vehicle accelerates and squeals tires"], "sample_ids": ["uZesmtKZGSw", "yRx9txMcBl0"], "start_seconds": ["250", "40"], "properties": ["men, talk, cars", "accelerates, tires, squeals"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "pigeons vocalize and birds chirp"], 
"sample_ids": ["vimzuGQvdcU", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["a, man, yells", "vocalize, bird, chirp"], "captions_pred_video": ["a group of people are rafting down a river", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a child speaks", "an infant crying frantically"], "sample_ids": ["yW6FWLSLkx4", "zwOBqeFTgiU"], "start_seconds": ["40", "30"], "properties": ["a, child, speaks", "cry, infant, frantically"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of the baby crying in the car seat"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a baby cries loudly"], "question": "which entity is a child", "label": 0}, {"captions": ["frogs croak and vocalize", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yswmmRZFItk", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["croak, vocalize, frog", "loud, multiple, distance"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["water flows as men speak and yell", "people speak as gunfire rings out"], "sample_ids": ["vJ7JPEFhyLA", "wqTCwqVRDlk"], "start_seconds": ["16", "80"], "properties": ["water, flow, men", "gunfire, ring, speak"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", 
"people cheer as a vehicle engine revs"], "sample_ids": ["zALy31PjDl0", "xjhAnI2q6hM"], "start_seconds": ["21", "6"], "properties": ["a man, a vehicle, a horn", "engine revs, vehicle, people"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["someone is snoring while sleeping", "people cheer as a vehicle engine revs"], "sample_ids": ["ujMt0-D-x2k", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["snore, sleep, someone", "engine revs, vehicle, people"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person is snoring loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a telephone rings followed by a woman talking"], "sample_ids": ["wy1eKjR7KC0", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["people, talk, distance", "ring, talk, woman"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a woman speaks happily and an animal chirps"], "sample_ids": ["uJV8NDaHqqk", "uWAAAL4CIoc"], "start_seconds": ["100", "0"], "properties": ["loud, fly, chirp", "a woman, chirps, animal"], "captions_pred_video": ["a bee hive in a wooden box", null], "captions_pred_audio": ["a swarm of bees buzzing 
around", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and wind blows", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sxIvBMSavMQ", "wqZ135Ssz0"], "start_seconds": ["210", "60"], "properties": ["birds, chirp, wind", "two men, woman, birds"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sapQIQUhFc", "uZesmtKZGSw"], "start_seconds": ["280", "250"], "properties": ["water, trickles, flow", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks and food sizzles while frying", "wind blowing followed by a zoom"], "sample_ids": ["wTideSjRFS0", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["food, sizzle, woman", "wind, blow, zoom"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["a woman speaks and a baby laughs", "small dogs yip and bark sharply"], "sample_ids": ["tOj4tdLRaA", "v-wcQf4BDY0"], "start_seconds": ["70", "120"], "properties": ["woman, laugh, baby", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sQwlkXjQabo", "wqZ135Ssz0"], "start_seconds": ["10", "60"], "properties": ["liquid, surface, spray", "two men, woman, birds"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a 
propeller rotates loudly and intensely"], "sample_ids": ["spJCm8tD9Zo", "ugHJF0hfYkg"], "start_seconds": ["90", "10"], "properties": ["snores, wheezes, sleeps", "loud, intense, propeller"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person is snoring loudly", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["someone snores nearby", "females talk and laugh over gusting wind"], "sample_ids": ["spJCm8tD9Zo", "un9VQlzgZM"], "start_seconds": ["90", "5"], "properties": ["someone snores, nearby, someone", "females, talk, laugh"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["people speak as gunfire rings out", "birds chirp and objects are moved around"], "sample_ids": ["wqTCwqVRDlk", "yPUYU6t3rwo"], "start_seconds": ["80", "370"], "properties": ["gunfire, ring, speak", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a gun is fired", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a helicopter engine idles continuously", "birds chirp and pigeons vocalize while walking around"], "sample_ids": ["ugHJF0hfYkg", "wIvYjuR3nrg"], "start_seconds": ["10", "9"], "properties": ["engine, idle, continuously", "birds, pigeons, vocalize"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a pigeon sitting on a roof with trees in the background"], "captions_pred_audio": ["a helicopter is flying 
overhead ", "birds are chirping and cooing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a helicopter engine runs"], "sample_ids": ["y2bVZ7rz-5M", "t5ZbXbniOWk"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "engine, helicopter, run"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a helicopter is flying overhead "], "question": "which entity is a helicopter?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zl9Dqx-j7q4", "vfYTJq7nU"], "start_seconds": ["6", "130"], "properties": ["engine, laugh, loud", "rustling, ducks, quack"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "water flows as men speak and yell"], "sample_ids": ["sU53zg9Jp7s", "vJ7JPEFhyLA"], "start_seconds": ["380", "16"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "water, flow, men"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["paper is repeatedly crumpled and 
crinkled", "a man speaks as a car is passing by"], "sample_ids": ["vms5XGTDVQc", "sK4u5T8hW78"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "a, car, pass"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "water splashes and a door squeaks"], "sample_ids": ["w2bYrCVLT60", "sdXV-ylviw"], "start_seconds": ["120", "190"], "properties": ["ducks, speak, quack", "sound, splash, door"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a dog barks and taps with background noise "], "question": "which entity is silent", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zY3icUyMdh8", "wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "rooster, crow, background, men"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more social", "label": 
1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "wind blowing followed by a zoom"], "sample_ids": ["sofxkNWaP0s", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["wind, engine, louder", "wind, blow, zoom"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom?", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a man speaks as a motor runs in the background"], "sample_ids": ["s59PfAghdkM", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["bird, chirp, background, horse, neigh", "background, motor, run"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "an airplane engine spools and people speak"], "sample_ids": ["un9VQlzgZM", "wTjoRj1se3U"], "start_seconds": ["5", "390"], "properties": ["females, talk, laugh", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a stream of water 
flows as people talk and wind blows"], "sample_ids": ["sQwlkXjQabo", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "stream, water, flow"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage is blurry and out of focus"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with wind noise in the background "], "question": "which entity is flowing", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xSKJGCItUWE", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["engine, run, boy", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["multiple people speak while a television play and a child screams", "a diesel truck engine runs steadily"], "sample_ids": ["yks4cLgIDMc", "sZvwOuuPGP0"], "start_seconds": ["170", "50"], "properties": ["multiple people, television, child", "engine, diesel, truck"], "captions_pred_video": ["footage of two kids wrestling on the floor", "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a man is speaking and a child is crying", "a medium engine is running "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["an insect buzzes around continuously", "a toilet flushes and water drains unevenly"], "sample_ids": ["v25l1jef3JY", "vhJWZheqaE"], "start_seconds": ["0", "0"], "properties": ["buzzes, continuously, insect", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["a black 
background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a toilet is flushed"], "question": "which entity is not a living thing", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["ziUT9IFTkjg", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["background, birds, rustling", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sZvwOuuPGP0", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["engine, diesel, truck", "airplane, boy, fly"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a medium engine is running ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a stream of water runs briefly", "a woman speaks as she rubs two objects together"], "sample_ids": ["x-PeY8Yb8M4", "vzxHnu-SFEw"], "start_seconds": ["300", "80"], "properties": ["stream, water, run", "two objects, woman, speak"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a car is driving on a wet road ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a physical action", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["x5cuQjOdM3E", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["cat, talk, meow", "loud, jet engine, roar"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a cat meows and a woman speaks", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "small dogs yip and bark sharply"], "sample_ids": ["vZAw4apG0Es", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["background, clock, ticktocks", "bark, yip, sharply"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a clock is ticking and people are talking", "a dog barks and growls"], "question": "which entity is more 
active", "label": 1}, {"captions": ["a person snoring", "birds chirp and objects are moved around"], "sample_ids": ["t8tv5YRMJUg", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["a person, snore, loud", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man getting his face licked by another man", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a person sniffs and breathes heavily", "insects buzz and a man speaks"], "question": "which entity is not a person?", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "wind blows as people chatter quietly"], "sample_ids": ["uKCSGgof8gI", "xBxDz0CFVn0"], "start_seconds": ["12", "30"], "properties": ["chirps, distance, signal", "wind, chatter, people"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a dog barks and whimpers", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sShpyu2l4YQ", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["barks, whimpers, dog", "a woman, laughs, animal"], "captions_pred_video": ["the puppies are playing with a toy", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a baby is crying"], "question": "which entity is more playful", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xfudFO976zE", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["animal, bleats, cry", "engine, laugh, loud"], "captions_pred_video": ["footage is blurry and shaky", "footage of a man driving a car in the 
dark"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["dogs barking and whimpering", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tIY7qOV3rEM", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["barking, whimpering, dog", "gun, shoot, water"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more violent", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "pigeons vocalize and birds chirp"], "sample_ids": ["ukxt9I7eMMg", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["continuous, woman, speaking", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "small dogs yip and bark sharply"], "sample_ids": ["vYkA3cfXp5Q", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["speed, idle, accelerate", "bark, yip, sharply"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["an engine is idling", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "television program is played far away while a woman talks and birds tweet nearby"], 
"sample_ids": ["yLy-WycbVVE", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "a woman, a television program, a bird"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing in the background?", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["tDVADusiIoc", "tDlysoZiA1I"], "start_seconds": ["60", "0"], "properties": ["wind, radio, waves", "animal, grunts, chirps"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "paper is crumpling consistently"], "sample_ids": ["zhx6hoYrHeI", "v5cSxLaHADY"], "start_seconds": ["160", "0"], "properties": ["engine, sputter, rough", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an insect buzzes around continuously", "winds blows roughly as a vehicle races past"], "sample_ids": ["v25l1jef3JY", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["buzzes, continuously, insect", "wind, blows, vehicle"], 
"captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["water flows followed by women screaming", "a woman speaks happily and an animal chirps"], "sample_ids": ["w5W5Kqtc8E", "uWAAAL4CIoc"], "start_seconds": ["100", "0"], "properties": ["water, flow, women", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["an infant crying frantically", "people speak as gunfire rings out"], "sample_ids": ["zwOBqeFTgiU", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["cry, infant, frantically", "gunfire, ring, speak"], "captions_pred_video": ["of the baby crying in the car seat", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war zone", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "people speak as gunfire rings out"], "sample_ids": ["sHbXC6na9hg", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["a person, saw, wood", "gunfire, ring, speak"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "an infant crying frantically"], "sample_ids": ["zofjfKhqLk8", "zwOBqeFTgiU"], "start_seconds": ["10", "30"], 
"properties": ["noise, stop, motor", "cry, infant, frantically"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of the baby crying in the car seat"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a stream of water flows quickly", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wbHTKEJZyhc", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["stream, water, flow", "men, talk, cars"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is moving faster", "label": 1}, {"captions": ["goats bleat and people speak", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["z5iUE5h0EPs", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["goats bleat, people speak, language", "rooster, crow, background, men"], "captions_pred_video": ["of the goat in the barn", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a goat bleats and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a rooster?", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "a machine beeps continuously"], "sample_ids": ["vbpKkWvfOu4", "y682ml90jGw"], "start_seconds": ["560", "11"], "properties": ["a, man, speaks", "beeps, machine, continuously"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a person is snoring while sleeping", "people applaud and hoot and chat quietly"], "sample_ids": ["vJrjSeP17yE", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["a person is sleeping, snoring, person", "people, applaud, 
hoot"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a helicopter engine idles continuously", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["ugHJF0hfYkg", "wqZ135Ssz0"], "start_seconds": ["10", "60"], "properties": ["engine, idle, continuously", "two men, woman, birds"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a man woman speak while crickets sing"], "sample_ids": ["zj2R0XoFr5k", "zTLVJCo4WEE"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, woman", "a, crickets, sing"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman speaks and crickets chirp"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "water pouring and bubbling"], "sample_ids": ["zY3icUyMdh8", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["dog, bark, engine", "water, bubbles, pouring"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["some people speak", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vbZ-0lGPneg", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "beeps, hit, woman"], "captions_pred_video": ["of a man holding a baby duck in his hands", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "someone is typing on a computer keyboard"], "sample_ids": ["t8CV69hcvF0", "v0x1odnXtP0"], "start_seconds": ["210", "210"], "properties": ["person, sneeze, follow", "keyboard, type, computer"], "captions_pred_video": ["of an airplane flying in the dark sky at 
night", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman sneezes and speaks", "a person is typing on a keyboard"], "question": "which person is typing on a computer keyboard", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a vehicle engine accelerating then running on idle"], "sample_ids": ["yJ0TePmaOo", "vYkA3cfXp5Q"], "start_seconds": ["390", "30"], "properties": ["two hard objects, man, speak", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "an engine is idling"], "question": "which is a vehicle", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a infant makes noise and is excited"], "sample_ids": ["zuua6-5goWw", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "noise, excited, infant"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "a steam engine runs and whistles as it passes by"], "sample_ids": ["wyllXV6PjKo", 
"se87d6yxEOA"], "start_seconds": ["30", "10"], "properties": ["a kid, talk, cry", "run, whistle, pass"], "captions_pred_video": [null, "footage of a train passing by a train station with smoke billowing out of the train's smokestack"], "captions_pred_audio": ["a woman speaks and a baby cries", "a train is moving and blowing its whistle "], "question": "which entity is moving", "label": 1}, {"captions": ["paper folding and crinkling", "a woman speaks as frying food sizzles"], "sample_ids": ["zPpG3RD8lSs", "wTideSjRFS0"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "food, sizzle, woman"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a woman is speaking while water is running in the background"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "an infant crying frantically"], "sample_ids": ["xO-Q2BlIIPU", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["two men, exclamation, speak", "cry, infant, frantically"], 
"captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uiS58TNyUiw", "uEU-Hg5MTN8"], "start_seconds": ["430", "27"], "properties": ["vocalize, bird, chirp", "a woman, laughs, animal"], "captions_pred_video": ["of the pigeon in the cage", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a woman is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["uEU-Hg5MTN8", "yDoT73BWsdA"], "start_seconds": ["27", "10"], "properties": ["a woman, laughs, animal", "engine, revs, vehicle"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xC8kbrKJmco", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["background, goat, scream", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a goat is bleating ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["multiple ducks quack continuously", "an adult 
woman and an adult man speak"], "sample_ids": ["wfHeoPDLMaM", "zTLVJCo4WEE"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "two people, adult, speak"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["ducks are quacking", "a woman speaks and crickets chirp"], "question": "which entity is speaking", "label": 1}, {"captions": ["a door opens and closes", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vBHyYJ8pL0", "uEU-Hg5MTN8"], "start_seconds": ["2", "27"], "properties": ["open, close, door", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a door", "label": 0}, {"captions": ["water rushes by", "a man speaks as water trickles down a stream"], "sample_ids": ["x-PeY8Yb8M4", "sapQIQUhFc"], "start_seconds": ["300", "280"], "properties": ["water, rushes, by", "water, stream, trickles"], 
"captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is moving more slowly", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "an audience gives applause"], "sample_ids": ["xZepNM9qcRA", "x6iCUDmRpKQ"], "start_seconds": ["30", "38"], "properties": ["background, motor, run", "applause, audience, give"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a black background with the moon and stars in the sky"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a group of people are clapping and cheering"], "question": "which is a more active scene", "label": 1}, {"captions": ["a power tool runs and touches a surface", "some people speak"], "sample_ids": ["zfvPRf3chY", "vbZ-0lGPneg"], "start_seconds": ["290", "30"], "properties": ["power tool, run, touch", "some people speak English, some people speak Spanish, some people speak French"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a woman is speaking and a dog is whimpering"], "question": "which entity is not a power tool", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a person screams glaringly"], "sample_ids": ["vSeGhaZt-aI", "xC8kbrKJmco"], "start_seconds": ["50", "0"], "properties": ["water, sink, talk", "glaringly, screams, person"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a goat is bleating "], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "paper is crumpling 
consistently"], "sample_ids": ["sQGXqGcwOTc", "v5cSxLaHADY"], "start_seconds": ["3", "0"], "properties": ["cling, speak, dishes", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vJ7JPEFhyLA", "yDoT73BWsdA"], "start_seconds": ["16", "10"], "properties": ["three men, wind, flow", "engine, revs, vehicle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a car speeds away loudly followed by a car revving loudly and driving away while outside"], "sample_ids": ["ukg5L09Wpvo", "sjlVMgdGSK0"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "car, revving, loudly"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a car accelerates and revs its engine "], "question": "which entity is revving loudly", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "pigeons vocalize and birds chirp"], "sample_ids": ["y2bVZ7rz-5M", "uiS58TNyUiw"], "start_seconds": ["280", "430"], "properties": ["motor noise, horn, siren", "vocalize, bird, chirp"], 
"captions_pred_video": ["footage of a parade of fire trucks driving down the street", "of the pigeon in the cage"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["sQGXqGcwOTc", "vlJS7LN2XyM"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "background, clocks, ticking"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a ticktock of a clock"], "question": "which entity is more quiet", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tIY7qOV3rEM", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "a woman, laughs, animal"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uZesmtKZGSw", "tdWhHV3X25Q"], "start_seconds": ["250", "60"], "properties": ["men, talk, cars", "applause, audience, yells"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "people applaud and hoot and chat quietly"], "sample_ids": ["smGI3C1NZc", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["water, drain, toilet", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "people are clapping and speaking with background noise "], "question": "which entity is a performance", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vcmWSmvti8", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["music, man, fire", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a duck quacks and a woman speaks"], "question": "which entity is about hunting?", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "a toilet flushes and a female speaks"], "sample_ids": ["tMJne1a4AFI", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["wind, buzz, rustling", "female, flushes, toilet"], "captions_pred_video": ["a swarm of bees on the 
ground", "footage is blurry and out of focus"], "captions_pred_audio": ["a swarm of bees buzzing around", "a toilet flushes and a man speaks"], "question": "which entity is not a toilet?", "label": 0}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a man speaks followed by another man speaking outside"], "sample_ids": ["ul60S8TXDA8", "viuTg1M-dqg"], "start_seconds": ["60", "30"], "properties": ["sound, distance, bell", "two men, speak, follow"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["a dog barks and whimpers", "a man speaks as a car is passing by"], "sample_ids": ["sShpyu2l4YQ", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "a, car, pass"], "captions_pred_video": ["the puppies are playing with a toy", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a person is whistling a tune", "a toilet flushes and water drains"], "sample_ids": ["scYRUkrFLiQ", "sfAvvZwdLCY"], "start_seconds": ["30", "20"], "properties": ["a, tune, whistle", "water drains, flushes, water"], "captions_pred_video": ["of the man wearing a bow tie and a 
suit jacket in front of a red door", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a person whistling a song", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a man speaks as a car is passing by"], "sample_ids": ["vZAw4apG0Es", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["background, clock, ticktocks", "a, car, pass"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a clock ticktocking in the background", "label": 0}, {"captions": ["there are rhythmical snoring nearby", "an engine runs loudly"], "sample_ids": ["ujMt0-D-x2k", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["snoring, rhythmical, nearby", "loud, engine, run"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person is snoring loudly", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["food is frying and sizzles", "a man speaks as a car is passing by"], "sample_ids": ["zNRChLjqcU", "sK4u5T8hW78"], "start_seconds": ["220", "30"], "properties": ["food is frying, sizzles, food", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking with background noise and breathing sounds "], "question": "which entity is not a person?", "label": 0}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "three men talk while wind blows and some liquid flows"], "sample_ids": ["smDKStoHBJo", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["a, infant, speaking", "three men, wind, flow"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a child yells and another yells", "people cheer as a vehicle engine revs"], "sample_ids": ["vMDHu7Lxcgw", "xjhAnI2q6hM"], "start_seconds": ["410", "6"], "properties": ["two, yell, child", "engine revs, vehicle, people"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["a horse runs while two women talk", "a clock ticktocks"], "sample_ids": ["sdvI1mHAsc", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["two women, horse, run", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a 
cuckoo clock hanging on the wall"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "vehicles pass by on a roadway"], "sample_ids": ["xV7Mg1QucSc", "tgbONvsP47Y"], "start_seconds": ["14", "0"], "properties": ["alarm, ticktocks, laughs", "pass, vehicle, roadway"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage of a fire truck entering a garage"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a car is driving on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a car speeding up in the distance"], "sample_ids": ["tDlysoZiA1I", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["animal, grunt, chirp", "distance, car, speed"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["two frogs croak at each other", "vehicles pass by on a roadway"], "sample_ids": ["zg0X6BnhOLQ", "tgbONvsP47Y"], "start_seconds": ["410", "0"], "properties": ["two frogs, croak, at each other", "pass, vehicle, roadway"], "captions_pred_video": ["footage of lightning in the sky at night", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a frog is croaking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["someone whistles a tune", "a man is filing a hard object"], "sample_ids": ["sIXTftIuUgw", "vveS8HT7Uog"], "start_seconds": ["90", "100"], "properties": ["someone, tune, whistle", "a man, hard, object"], "captions_pred_video": [null, "footage is of a workbench with various tools on it including a hammer and a 
screwdriver"], "captions_pred_audio": ["a person whistling a song", "a man is filing and speaking with background noise and breathing "], "question": "which action is more passive", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zofjfKhqLk8", "y8WEcpOlT3I"], "start_seconds": ["10", "40"], "properties": ["background, metal, clings", "harsh, wind, blows"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a train horn blows as it passes by"], "sample_ids": ["w5W5Kqtc8E", "zVacuqSb4LI"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a train whistle blows and a train passes by with a whistle blowing "], "question": 
"which entity is a train?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zofjfKhqLk8", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["noise, stop, motor", "female, spraying, scream"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tiDFTC-5vU", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["male, duck, laugh", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a door opens and birds chirp", "a woman speaks with water running"], "sample_ids": ["yeFvk9x0wWI", "wTideSjRFS0"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "water, running, woman"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a woman is speaking while water is running in the background"], "question": "which entity is a video of a door opening and birds chirping?", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "a clock ticks quietly and rhythmically"], "sample_ids": ["vddP56-ogds", "u7C-AEBQM"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "ticks, rhythmic, quiet"], 
"captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "water flows as men speak and yell"], "sample_ids": ["s3cTDAj31g", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["man, talk, woman", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["water flows as men speak and yell", "an engine starts and increases in power"], "sample_ids": ["vJ7JPEFhyLA", "zjTG0gaGCUI"], "start_seconds": ["16", "80"], "properties": ["water, flow, men", "power, increase, engine"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars as wind blows "], "question": "which entity is more powerful", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zkKdxzNC97Y", "zj2R0XoFr5k"], "start_seconds": ["27", "50"], "properties": ["hard, surface, door", "airplane, boy, fly"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a door is opened and closed", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["multiple ducks quack continuously", "a duck quacks loudly and continuously"], "sample_ids": ["wfHeoPDLMaM", "vh30P49Po6s"], "start_seconds": 
["30", "30"], "properties": ["multiple, quack, continuously", "loud, continuous, quacks"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["ducks are quacking", "a duck is quacking loudly"], "question": "which duck is louder", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "people applaud and hoot and chat quietly"], "sample_ids": ["vXlk0lIQBFo", "wwyfGO2J4"], "start_seconds": ["470", "90"], "properties": ["wind, speak, vocalize", "people, applaud, hoot"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "someone is typing on a computer keyboard"], "sample_ids": ["tQWGZLItBXk", "v0x1odnXtP0"], "start_seconds": ["170", "210"], "properties": ["music, person, ding", "keyboard, type, computer"], "captions_pred_video": ["worms revolution 
screenshots", "how to make money on youtube in spanish"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a person is typing on a keyboard"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a church bell rings several times", "winds blows roughly as a vehicle races past"], "sample_ids": ["sUVVjE3Ucp8", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["ring, bell, several", "wind, blows, vehicle"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a church bell is ringing ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a baby laugh at a sputter", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sLUnaPT5gM8", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["laugh, sputter, baby", "two men, woman, birds"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["uqFtmnhuqA8", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["a, b, c", "water, radio, man"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man 
speaking over a radio?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["vh30P49Po6s", "rwtmaKiCcQU"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "nozzle, depressed, spray can"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a duck is quacking loudly", "spraying and people speaking"], "question": "which entity is silent", "label": 1}, {"captions": ["a helicopter engine idles continuously", "birds chirp as a train approaches"], "sample_ids": ["ugHJF0hfYkg", "xM4joTqDVp4"], "start_seconds": ["10", "160"], "properties": ["engine, idle, continuously", "bird, chirp, train"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["a helicopter is flying overhead ", "birds are chirping and a train is moving "], "question": "which entity is not a train?", "label": 0}, {"captions": ["a train horn blows as it passes by", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["zVacuqSb4LI", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["horn, blows, train", "motor noise, horn, siren"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["slZLHwNbbt4", "wDVMhEdTiVw"], "start_seconds": ["300", "30"], "properties": ["a, horn, run", "gun, shoot, water"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "pigeons vocalize and birds chirp"], "sample_ids": ["yks4cLgIDMc", "uiS58TNyUiw"], "start_seconds": ["170", "430"], "properties": ["background, speaking, child", "vocalize, bird, chirp"], "captions_pred_video": ["footage of two kids wrestling on the floor", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking and a child is crying", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man speaks as a machine runs", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["vD6lYD1l0BY", "tezvROoo4bs"], "start_seconds": ["330", "40"], "properties": ["a, machine, run", "audio, throttle, speaking"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a busy city street with cars parked on both sides of the 
road"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a car accelerates and revs while a man speaks "], "question": "which entity is about a man speaking as a machine runs?", "label": 0}, {"captions": ["people speak as gunfire rings out", "a woman speaks as she rubs two objects together"], "sample_ids": ["wqTCwqVRDlk", "vzxHnu-SFEw"], "start_seconds": ["80", "80"], "properties": ["gunfire, ring, speak", "two objects, woman, speak"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a duck quacks continuously"], "sample_ids": ["vh30P49Po6s", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "quacks, continuously, duck"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a man brushing his teeth with a toothbrush in his 
mouth"], "captions_pred_audio": ["a duck is quacking loudly", "a duck is quacking loudly"], "question": "which duck is louder", "label": 0}, {"captions": ["a man speaks as he moves silverware in a bowl", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["x6ijhqRY38s", "vfYTJq7nU"], "start_seconds": ["250", "130"], "properties": ["bowl, silverware, man", "rustling, ducks, quack"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a duck quacks and a woman speaks"], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["an small aircraft engine runs and a boy speaks", "continuous chugging with birds chirping in the background"], "sample_ids": ["xSKJGCItUWE", "xM4joTqDVp4"], "start_seconds": ["10", "160"], "properties": ["engine, run, boy", "background, chirp, birds"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "birds are chirping and a train is moving "], "question": "which entity has a boy speaking?", "label": 0}, {"captions": ["a weapon fires multiple times", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sMC07Ucy7kg", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["weapon, fire, multiple", "loud, jet engine, roar"], "captions_pred_video": ["footage is from a car's point of view", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a boat travels through the waves as 
the wind blows loudly and a man speaks over a radio", "some tunes played by whistling"], "sample_ids": ["tDVADusiIoc", "u6BnG6YZqJ4"], "start_seconds": ["60", "0"], "properties": ["wind, radio, waves", "tune, play, whistling"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["a train engine runs and a horn blows", "an infant crying as a woman laughs"], "sample_ids": ["zPX9o1uDiI", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["engine, horn, run", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a male speaks over some small clicks", "a motorcycle idles loudly as wind blows"], "sample_ids": ["uXxVebHsGZ8", "v7jJS8aAyA"], "start_seconds": ["30", "10"], "properties": ["male, clicks, speak", "wind, blows, loudly"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a motorcycle engine is idling and vibrating"], "question": "which entity is louder", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a clock ticktocks"], "sample_ids": ["rqfQRErjfk8", "v-g-j2uTByM"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, 
{"captions": ["birds coo incessantly", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yZrFNS7GFBQ", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["coo, bird, incessant", "multiple, people, yell"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tK4VlLsNxak", "wqZ135Ssz0"], "start_seconds": ["120", "60"], "properties": ["a, dial, telephone", "two men, woman, birds"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["some men converse over an engine running", "an infant crying frantically"], "sample_ids": ["sCiy7QS1U", "zwOBqeFTgiU"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "birds chirp and objects are moved around"], "sample_ids": ["vddP56-ogds", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["water, flow, laugh", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "insects buzz and a man speaks"], "question": 
"which entity is more active", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["t25U-v4k4ts", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["a, chirps, bird", "water, radio, man"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "water pouring and bubbling"], "sample_ids": ["xhmRY9yhC7c", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["a, laugh, infant", "water, bubbles, pouring"], "captions_pred_video": ["of a baby crying in a baby bouncer", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a baby cries and a woman speaks", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "an infant crying frantically"], "sample_ids": ["xZepNM9qcRA", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], 
"properties": ["background, motor, run", "cry, infant, frantically"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "of the baby crying in the car seat"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a duck quacks continuously"], "sample_ids": ["vKrYfzleLB8", "vh30P49Po6s"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "quacks, continuously, duck"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yRx9txMcBl0", "tiDFTC-5vU"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "male, duck, laugh"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a vehicle accelerates 
squealing tires", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sd7xVssqlw", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["accelerates, tires, squealing", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a stream of water runs briefly"], "sample_ids": ["siJFXfGWgDk", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["a, bird, vehicle", "stream, water, run"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sd7xVssqlw", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["accelerates, tires, squealing", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "dishes cling together then a man begins to speak"], "sample_ids": ["wz7N8YRy74I", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["rooster, crow, background, men", "cling, speak, dishes"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a man washing dishes in a 
commercial kitchen"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "a person screams glaringly"], "sample_ids": ["vzxHnu-SFEw", "xC8kbrKJmco"], "start_seconds": ["80", "0"], "properties": ["two objects, woman, speak", "glaringly, screams, person"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a goat is bleating "], "question": "which entity is more silent", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uoGVs9yUqY4", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["multiple, vocalize, wind", "stream, water, flow"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and flapping their 
wings with wind noise in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["ugHJF0hfYkg", "w34HjHr6gAY"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", "beeps, hit, woman"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a helicopter is flying overhead ", "a beep sounds followed by a child speaking"], "question": "which entity is quieter", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "water pouring and bubbling"], "sample_ids": ["ukxt9I7eMMg", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["food, pan, cook", "water, bubbles, pouring"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "wind blows as people chatter quietly"], "sample_ids": ["wRV8yMk886E", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "wind, chatter, people"], "captions_pred_video": ["two cars are parked in a parking lot at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["siJFXfGWgDk", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["man, woman, vehicle", "three men, wind, flow"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as a machine runs", "water flows as men speak and yell"], "sample_ids": ["vD6lYD1l0BY", "vJ7JPEFhyLA"], "start_seconds": ["330", "16"], "properties": ["a, machine, run", "water, flow, men"], "captions_pred_video": ["game controller being held in the hands of the person", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking while the wind is 
blowing and water is splashing"], "question": "which entity has a man speaking as a machine runs?", "label": 0}, {"captions": ["a clock ticktocks", "wind blows in gusts as a woman speaks in the distance"], "sample_ids": ["v-g-j2uTByM", "uC9dtII1KDI"], "start_seconds": ["30", "150"], "properties": ["ticktocks, clock, ticktocks", "wind, gusts, distance"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of a person riding a horse in a riding arena"], "captions_pred_audio": ["a clock is ticking loudly", "a woman is speaking with wind noise and breathing in the background "], "question": "which entity is not a clock?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zofjfKhqLk8", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["background, metal, clings", "applause, audience, yells"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zNRChLjqcU", "xBxDz0CFVn0"], "start_seconds": ["220", "30"], "properties": ["water, faucet, run", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking with wind noise in the background "], "question": "which entity is flowing water", "label": 1}, {"captions": ["a man talks as several small engines run", "a man is snoring loudly and repeatedly"], "sample_ids": ["u9A6VZQCZpU", "sncRqQ67iJU"], "start_seconds": ["30", "460"], 
"properties": ["a, man, talk", "loud, repeatedly, man"], "captions_pred_video": [null, "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a person is snoring"], "question": "which man is louder", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a man speaks on a radio as wind blows"], "sample_ids": ["s4Uz1Ffgo04", "tDVADusiIoc"], "start_seconds": ["100", "60"], "properties": ["roars, background, people speaking", "man, radio, blows"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking on a radio?", "label": 1}, {"captions": ["people speak then an engine runs", "water pouring and bubbling"], "sample_ids": ["uMTTDZ2mb4", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["engine, run, people", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["people are talking and a 
car is driving by with wind noise in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["two frogs croak at each other", "a telephone rings followed by a woman talking"], "sample_ids": ["zg0X6BnhOLQ", "tGcFnX0GHI"], "start_seconds": ["410", "0"], "properties": ["two frogs, croak, at each other", "ring, talk, woman"], "captions_pred_video": ["footage of lightning in the sky at night", null], "captions_pred_audio": ["a frog is croaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a man speaks as a car is passing by"], "sample_ids": ["yYEVLuqEytU", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["grunt, slurp, background", "a, car, pass"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["bees buzz and wind blows", "people cheer as a vehicle engine revs"], "sample_ids": ["tMJne1a4AFI", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["bees buzz, wind blows, bees", "engine revs, vehicle, people"], "captions_pred_video": ["a swarm of bees on the ground", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a swarm of bees buzzing around", "a truck is 
revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "people applaud and hoot and chat quietly"], "sample_ids": ["tiDFTC-5vU", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["male, duck, laugh", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be at a party", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "a horn rings out as a machine runs by"], "sample_ids": ["zVacuqSb4LI", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["blares, fades, train", "a, horn, run"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a male is speaking and a duck quacks as 
others laugh"], "sample_ids": ["uEU-Hg5MTN8", "tiDFTC-5vU"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "male, duck, laugh"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a person is snoring while sleeping", "people speak as gunfire rings out"], "sample_ids": ["vJrjSeP17yE", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["a person is sleeping, snoring, person", "gunfire, ring, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["people cheer as a vehicle engine revs", "vehicles pass by on a roadway"], "sample_ids": ["xjhAnI2q6hM", "tgbONvsP47Y"], "start_seconds": ["6", "0"], "properties": ["engine revs, vehicle, people", "pass, vehicle, roadway"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "winds blows roughly as a vehicle races past"], "sample_ids": ["sZPuqDgX2V0", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["engine, accelerate, intercom", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a jet engine roars and wind blows "], "question": "which entity 
is a vehicle", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xKB8O8LTs6s", "uEU-Hg5MTN8"], "start_seconds": ["70", "27"], "properties": ["music, radio, gunshots", "a woman, laughs, animal"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking on a radio?", "label": 0}, {"captions": ["a man speaking with light rustling", "females talk and laugh over gusting wind"], "sample_ids": ["zOZleIRqZm4", "un9VQlzgZM"], "start_seconds": ["80", "5"], "properties": ["light, rustling, man", "females, talk, laugh"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is a group of people", "label": 1}, {"captions": ["a man talks as several small engines run", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["u9A6VZQCZpU", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["a, man, talk", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an aircraft engine runs", "a stream of water runs briefly"], "sample_ids": ["yLCORCnd35Q", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["engine, aircraft, runs", "stream, water, run"], 
"captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a person sniffs and sneezes", "someone is typing on a computer keyboard"], "sample_ids": ["uRlbY6aoBU", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["sneezes, person, sniffs", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is sneezing ", "a person is typing on a keyboard"], "question": "which person is typing on a computer keyboard", "label": 1}, {"captions": ["a beep occurs briefly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xtWeJ56-U-g", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["beep, occur, briefly", "music, gunfire, explosion"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["two frogs croak at each other", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zg0X6BnhOLQ", "sLUnaPT5gM8"], "start_seconds": ["410", "0"], "properties": ["two frogs, croak, at each other", "loud, laughter, intermittent"], "captions_pred_video": ["footage of lightning in the sky at night", "of a baby laying on his stomach in a blue shirt and diaper"], 
"captions_pred_audio": ["a frog is croaking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a siren comes to life as a horn blares", "a vehicle engine accelerating then running on idle"], "sample_ids": ["u--KhUW8l1Y", "vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "engine, accelerate, idle"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a clock ticktocks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["v-g-j2uTByM", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["ticktocks, clock, ticktocks", "engine, idle, woman"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a clock is ticking loudly", "a woman is speaking and a subway train is moving "], "question": "which entity is not stationary", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["xM4joTqDVp4", "y2bVZ7rz-5M"], "start_seconds": ["160", "280"], "properties": ["background, chirp, birds", "motor noise, horn, siren"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["rustling occurs, 
ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "someone is typing on a computer keyboard"], "sample_ids": ["vfYTJq7nU", "v0x1odnXtP0"], "start_seconds": ["130", "210"], "properties": ["rustling, ducks, quack", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wP8ZKrlx3oA", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["rain, storm, thunder", "men, talk, cars"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone snores nearby", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["spJCm8tD9Zo", "uZesmtKZGSw"], "start_seconds": ["90", "250"], "properties": ["someone snores, nearby, someone", "men, talk, cars"], "captions_pred_video": ["of a man 
laying on the ground with his mouth open", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman sneezes then speaks", "a toilet flushes and a female speaks"], "sample_ids": ["x4dZyf9Gbj0", "yaln9y8I7ms"], "start_seconds": ["130", "230"], "properties": ["sneezes, speaks, woman", "female, flushes, toilet"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman sneezes and speaks", "a toilet flushes and a man speaks"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yDoT73BWsdA", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["engine revs, tires squeal, vehicle", "loud, multiple, distance"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp, a woman speaks, and insects buzz", "a infant makes noise and is excited"], "sample_ids": ["t97k0cejSQE", "wIJK3-5y0kA"], "start_seconds": ["250", "30"], "properties": ["sound, 
chirp, buzz", "noise, excited, infant"], "captions_pred_video": ["a bee on a purple thistle flower", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a baby cries and a woman speaks"], "question": "which entity makes a lot of noise", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "an insect buzzes around continuously"], "sample_ids": ["su6FAOcOA8c", "v25l1jef3JY"], "start_seconds": ["4", "0"], "properties": ["engine, run, woman", "buzzes, continuously, insect"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "wind blowing followed by a zoom"], "sample_ids": ["sDSppXIlJrs", "vr8ZXjEBhMQ"], "start_seconds": ["27", "150"], "properties": ["microphone, water, wind", "wind, blow, zoom"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["the wind is blowing and water is splashing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a video of wind blowing?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a helicopter engine runs continuously"], "sample_ids": ["yRx9txMcBl0", "ugHJF0hfYkg"], "start_seconds": ["40", "10"], "properties": ["motors, tires, screech", "engine, running, continuously"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a helicopter is flying overhead "], "question": "which entity is running continuously", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "an engine runs loudly"], "sample_ids": ["tiDFTC-5vU", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["male, duck, laugh", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a clock ticktocks"], "sample_ids": ["zcDwZ6W7E3E", "v-g-j2uTByM"], "start_seconds": ["180", "30"], "properties": ["man, speak, motorcycles", "ticktocks, clock, ticktocks"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y2bVZ7rz-5M", "uEU-Hg5MTN8"], "start_seconds": ["280", "27"], "properties": ["engine, horn, siren", "a woman, laughs, animal"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "of a girl wearing a pig mask and a boy 
hugging her"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking and a baby is crying"], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wAAkbZToh8", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man burps and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a toilet flushes and water drains"], "sample_ids": ["vzxHnu-SFEw", "sfAvvZwdLCY"], "start_seconds": ["80", "20"], "properties": ["two objects, woman, speak", "water drains, flushes, water"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a toilet is 
flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "people cheer as a vehicle engine revs"], "sample_ids": ["yajyRTUQk3U", "xjhAnI2q6hM"], "start_seconds": ["400", "6"], "properties": ["noise, woman, speak", "engine revs, vehicle, people"], "captions_pred_video": ["- a woman cooking in the kitchen", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a goat screams and people speak in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xC8kbrKJmco", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["background, goat, scream", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a more natural background", "label": 1}, {"captions": ["a large crowd cheers and applauds", "vehicles pass by on a roadway"], "sample_ids": ["rqfQRErjfk8", "tgbONvsP47Y"], "start_seconds": ["170", "0"], "properties": ["crowd, cheers, applauds", "pass, vehicle, roadway"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a toilet flushes and water drains"], "sample_ids": ["tw76HGONaKg", "sfAvvZwdLCY"], "start_seconds": ["570", "20"], "properties": ["music, click, man", "water drains, flushes, water"], "captions_pred_video": ["game you are currently playing on microsoft 
xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a clock ticktocks in wind"], "sample_ids": ["zkKdxzNC97Y", "yVumC9TGknc"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "ticktocks, clock, wind"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "game title screen of the game shadow of the colossus on sony playstation 2"], "captions_pred_audio": ["a door is opened and closed", "a series of beeps and chirps"], "question": "which entity is quieter", "label": 1}, {"captions": ["white noise and birds chirping", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wRBHTgrbiwg", "xfaoyyzw2WU"], "start_seconds": ["50", "180"], "properties": ["noise, white, chirping", "loud, jet engine, roar"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["birds are chirping and insects are buzzing", 
"an aircraft engine roars and a man speaks "], "question": "which noise is louder", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yVumC9TGknc", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["humming, clock, birds", "engine, laugh, loud"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a series of beeps and chirps", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a machine runs", "waves crash against a shoreline and people speak"], "sample_ids": ["vD6lYD1l0BY", "yFB25fqfU8I"], "start_seconds": ["330", "300"], "properties": ["a, machine, run", "wave, crash, shoreline"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["w8uLijTqtlU", "y8WEcpOlT3I"], "start_seconds": ["70", "40"], "properties": ["wind, microphone, noise", "harsh, wind, blows"], "captions_pred_video": ["footage is blurry and shaky", "on how to use a sewing machine youtube"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking with wind noise in the background "], "question": "which entity is a recording of a harsh wind blowing?", "label": 1}, {"captions": ["an airplane engine runs", "people applaud and hoot and chat quietly"], "sample_ids": ["yVPZ2MNWpms", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["engine, airplane, runs", "people, 
applaud, hoot"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "people are clapping and speaking with background noise "], "question": "which entity is a performance", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a church bell rings several times"], "sample_ids": ["wyllXV6PjKo", "sUVVjE3Ucp8"], "start_seconds": ["30", "0"], "properties": ["a baby, a woman, a man", "ring, bell, several"], "captions_pred_video": [null, "the video shows a stone wall with a clock on top of it and a bench in front of it"], "captions_pred_audio": ["a woman speaks and a baby cries", "a church bell is ringing "], "question": "which entity is silent", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "waves crash against a shoreline and people speak"], "sample_ids": ["sHbXC6na9hg", "yFB25fqfU8I"], "start_seconds": ["0", "300"], "properties": ["a person, saw, wood", "wave, crash, shoreline"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "footage of a person surfing in the ocean"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be in a natural environment", "label": 1}, {"captions": ["a heavy rain falls endlessly", "water flows as men speak and yell"], "sample_ids": ["wP8ZKrlx3oA", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["heavy, rain, fall", "water, flow, men"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of water flowing?", "label": 1}, {"captions": 
["a clock alarm sounds and gears turn", "birds chirp and objects are moved around"], "sample_ids": ["w2M4i1mklOA", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["alarm, gears, turn", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of an antique clock", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "insects buzz and a man speaks"], "question": "which entity is more like a clock", "label": 0}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["su6FAOcOA8c", "tezvROoo4bs"], "start_seconds": ["4", "40"], "properties": ["engine, idle, woman", "audio, throttle, speaking"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a busy city street with cars parked on both sides of the road"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a car accelerates and revs while a man speaks "], "question": "which entity is about a bus engine?", "label": 0}, {"captions": ["two frogs croak at each other", "a car speeding up in the distance"], "sample_ids": ["zg0X6BnhOLQ", "u0TrcHhkPQ"], "start_seconds": ["410", "20"], "properties": ["two frogs, croak, at each other", "distance, car, speed"], "captions_pred_video": ["footage of lightning in the sky at night", null], "captions_pred_audio": ["a frog is croaking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["v0x1odnXtP0", "tDVADusiIoc"], "start_seconds": ["210", "60"], "properties": ["keyboard, type, computer", "water, radio, man"], "captions_pred_video": ["how to make money on youtube in spanish", "a person riding on the 
back of a sailboat in rough seas"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person?", "label": 0}, {"captions": ["an engine runs and a man speaks", "a duck quacks continuously"], "sample_ids": ["yT5WfYMRr-U", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "quacks, continuously, duck"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a man speaks as a motor runs in the background"], "sample_ids": ["zCrAfDfv6-A", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "background, motor, run"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person whistles a song", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yJ0TePmaOo", "uZesmtKZGSw"], "start_seconds": ["390", "250"], "properties": ["two hard objects, man, speak", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["running water in a faucet with some clinks", "a woman speaks and other women and a man talk with her"], "sample_ids": ["zNRChLjqcU", "vbpKkWvfOu4"], "start_seconds": ["220", "560"], "properties": ["water, faucet, run", "a, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["running water in a faucet with some clinks", "plastic is tapped on while someone speaks"], "sample_ids": ["zNRChLjqcU", "wvKpEYswXO0"], "start_seconds": ["220", "150"], "properties": ["water, faucet, run", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "water splashes as an animal walks through"], "sample_ids": ["w8uLijTqtlU", "w1ir-sZ3Im8"], "start_seconds": ["70", "90"], "properties": ["wind, microphone, noise", "animal, water, splashes"], "captions_pred_video": 
["footage is blurry and shaky", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["the wind is blowing strongly", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a recording", "label": 1}, {"captions": ["small dogs yip and bark sharply", "an infant crying frantically"], "sample_ids": ["v-wcQf4BDY0", "zwOBqeFTgiU"], "start_seconds": ["120", "30"], "properties": ["bark, yip, sharply", "cry, infant, frantically"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "of the baby crying in the car seat"], "captions_pred_audio": ["a dog barks and growls", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["material crumbles into a microphone", "some people speak"], "sample_ids": ["vofpvUo6NAw", "vbZ-0lGPneg"], "start_seconds": ["220", "30"], "properties": ["material, crumbles, microphone", "some people speak English, some people speak Spanish, some people speak French"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a woman is speaking and a dog is whimpering"], "question": "which entity is not a person", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tDlfY3nmx1A", "uEU-Hg5MTN8"], "start_seconds": ["160", "27"], "properties": ["applause, laugh, man", "a woman, laughs, animal"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a vehicle engine 
accelerating then running on idle", "an airplane engine runs"], "sample_ids": ["vYkA3cfXp5Q", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["engine, accelerate, idle", "engine, airplane, runs"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["an engine is idling", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uYT5gxnyMWM", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "stream, water, flow"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a car speeding up in the distance"], "sample_ids": ["sZPuqDgX2V0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["commentator, race, track", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which car is speeding up in the distance", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["w2JXXIAdUdg", "uZesmtKZGSw"], "start_seconds": ["10", "250"], "properties": ["snoring, distance, person", "men, talk, cars"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more likely to be a dream", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vzxHnu-SFEw", "uZesmtKZGSw"], "start_seconds": ["80", "250"], "properties": ["two objects, woman, speak", "men, talk, cars"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["bees buzz and wind blows", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tMJne1a4AFI", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["bees buzz, wind blows, bees", "wind, blow, vehicle"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blows?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y8WEcpOlT3I", "xKB8O8LTs6s"], "start_seconds": ["40", "70"], "properties": ["harsh, wind, blows", "music, gunfire, explosion"], "captions_pred_video": ["on how to use a sewing machine youtube", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and 
duck calls being blown"], "sample_ids": ["yFB25fqfU8I", "vfYTJq7nU"], "start_seconds": ["300", "130"], "properties": ["wave, crash, shoreline", "rustling, ducks, quack"], "captions_pred_video": ["footage of a person surfing in the ocean", null], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yRx9txMcBl0", "sSMl2vc3ek"], "start_seconds": ["40", "20"], "properties": ["motors, tires, screech", "loud, multiple, distance"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["xyx6eNVEYRY", "tw76HGONaKg"], "start_seconds": ["380", "570"], "properties": ["loud, engine, muffles", "A, game, keyboard"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina 
of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a man speaks and types on a computer keyboard "], "question": "which man is speaking", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zk-xJGQU8-4", "w5W5Kqtc8E"], "start_seconds": ["130", "100"], "properties": ["food, man, woman", "wind, blow, vehicle"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["a car speeding up in the distance", "an insect buzzes around continuously"], "sample_ids": ["u0TrcHhkPQ", "v25l1jef3JY"], "start_seconds": ["20", "0"], "properties": ["distance, car, speed", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a fly is buzzing around a microphone "], "question": "which entity is moving faster", "label": 0}, {"captions": ["a woman speaks and is crumpling paper", "a child speaks in closed space"], 
"sample_ids": ["xvDdE3zNf8Y", "yW6FWLSLkx4"], "start_seconds": ["120", "40"], "properties": ["A, crumple, paper", "child, space, speak"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a car accelerates and wind blows"], "sample_ids": ["tQWGZLItBXk", "u0TrcHhkPQ"], "start_seconds": ["170", "20"], "properties": ["music, kid, speak", "accelerates, wind, blows"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wRV8yMk886E", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["liquid, spray, nozzle", "engine, revs, vehicle"], "captions_pred_video": ["two cars are parked in a parking lot at night", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "people speak as gunfire rings out"], "sample_ids": ["sQGXqGcwOTc", "wqTCwqVRDlk"], "start_seconds": ["3", "80"], "properties": ["cling, speak, dishes", "gunfire, ring, speak"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man 
is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["y8WEcpOlT3I", "rqu8iB22IY"], "start_seconds": ["40", "5"], "properties": ["wind, speak, buffeting", "sound, repeats, laugh"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a dog barks and a man speaks while music plays "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["continuous snoring", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["sLkeqCDJIyw", "uEU-Hg5MTN8"], "start_seconds": ["120", "27"], "properties": ["loud, snoring, noise", "a woman, laughs, animal"], "captions_pred_video": [", what is the man doing on the couch? sleeping", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is not a person", "label": 0}, {"captions": ["a helicopter engine idles continuously", "continuous snoring"], "sample_ids": ["ugHJF0hfYkg", "sLkeqCDJIyw"], "start_seconds": ["10", "120"], "properties": ["engine, idle, continuously", "loud, snoring, noise"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", ", what is the man doing on the couch? 
sleeping"], "captions_pred_audio": ["a helicopter is flying overhead ", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sncRqQ67iJU", "tDVADusiIoc"], "start_seconds": ["460", "60"], "properties": ["loud, repeatedly, man", "water, radio, man"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person is snoring", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a man?", "label": 0}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a car accelerates and wind blows"], "sample_ids": ["slZLHwNbbt4", "u0TrcHhkPQ"], "start_seconds": ["300", "20"], "properties": ["clap, distance, horn", "accelerates, wind, blows"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xNMovAf3o50", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["rain, thunder, music", "a woman, laughs, animal"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "water 
splashes and a door squeaks"], "sample_ids": ["sjlVMgdGSK0", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["car, revving, loudly", "sound, splash, door"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uPDn2BFTHk", "vfYTJq7nU"], "start_seconds": ["140", "130"], "properties": ["lady, laugh, baby", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a baby?", "label": 0}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uPDn2BFTHk", "wz7N8YRy74I"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "wind blowing followed by a zoom"], "sample_ids": ["vddP56-ogds", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["water, flow, laugh", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "wind blows and a chainsaw cuts through wood "], "question": "which entity 
is more likely to blow", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a person speaks briefly"], "sample_ids": ["zF8yoL0rkbI", "zOZleIRqZm4"], "start_seconds": ["30", "80"], "properties": ["engine, run, someone", "person, talk, brief"], "captions_pred_video": ["footage of the traffic on the street at night", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking with crickets chirping in the background"], "question": "which entity is talking", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "an insect buzzes around continuously"], "sample_ids": ["sQGXqGcwOTc", "v25l1jef3JY"], "start_seconds": ["3", "0"], "properties": ["cling, speak, dishes", "buzzes, continuously, insect"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a motor slows to a stopover traffic noises"], "sample_ids": ["y8WEcpOlT3I", "zofjfKhqLk8"], "start_seconds": ["40", "10"], "properties": ["wind, speak, buffeting", "noise, stop, motor"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a large engine is running and a bell is ringing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "people speak as gunfire rings out"], "sample_ids": ["zhx6hoYrHeI", "wqTCwqVRDlk"], "start_seconds": ["160", "80"], "properties": ["engine, sputter, rough", "gunfire, ring, speak"], "captions_pred_video": ["footage of a man working on a 
motorcycle's tire", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a crowd yells, reacts and applauds"], "sample_ids": ["wTjoRj1se3U", "wztCSUxOf8"], "start_seconds": ["390", "130"], "properties": ["airplane, engine, spool", "a crowd, yells, applauds"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "an airplane engine spools and people speak"], "sample_ids": ["xKB8O8LTs6s", "wTjoRj1se3U"], "start_seconds": ["70", "390"], "properties": ["music, radio, gunshots", "airplane, engine, spool"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a jet engine is running and people are talking"], "question": "which entity is a video of a plane engine spooling?", "label": 1}, {"captions": ["a door opens and birds chirp", "water rushes and then a vehicle zooms past"], "sample_ids": ["yeFvk9x0wWI", "s4Uz1Ffgo04"], "start_seconds": ["30", "100"], "properties": ["door, open, birds", "water, rushes, vehicle"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is more likely to be in a movie", 
"label": 1}, {"captions": ["a goat screams and people speak in the background", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["xC8kbrKJmco", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["background, goat, scream", "man, woman, squawks"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is accompanied by a man and woman speaking", "label": 1}, {"captions": ["a woman and man are speaking", "a toilet flushes and water drains"], "sample_ids": ["vbpKkWvfOu4", "sfAvvZwdLCY"], "start_seconds": ["560", "20"], "properties": ["two people, speaking, woman, man", "water drains, flushes, water"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vbpKkWvfOu4", "xBxDz0CFVn0"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "stream, water, flow"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a woman speaking and 
then a man speaking?", "label": 0}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "water pouring and bubbling"], "sample_ids": ["zY3icUyMdh8", "uyRfq-jKPpo"], "start_seconds": ["20", "50"], "properties": ["dog, bark, engine", "water, bubbles, pouring"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "people speak then an engine runs"], "sample_ids": ["yDoT73BWsdA", "uMTTDZ2mb4"], "start_seconds": ["10", "30"], "properties": ["engine, revs, vehicle", "engine, run, people"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "people are talking and a car is driving by with wind noise in the background "], "question": "which entity has a vehicle with an engine?", "label": 0}, {"captions": ["a man talks while vehicles pass by", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["sK4u5T8hW78", 
"w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "beeps, hit, woman"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a beep sounds followed by a child speaking"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["someone whistles briefly", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uFoga8sHpiw", "w5W5Kqtc8E"], "start_seconds": ["90", "100"], "properties": ["sound, duration, pitch", "wind, blow, vehicle"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a longer duration", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a clock ticktocks"], "sample_ids": ["vfYTJq7nU", "v-g-j2uTByM"], "start_seconds": ["130", "30"], "properties": 
["rustling, ducks, quack", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a toilet flushes and a female speaks"], "sample_ids": ["x6ijhqRY38s", "yaln9y8I7ms"], "start_seconds": ["250", "230"], "properties": ["bowl, silverware, man", "female, flushes, toilet"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a toilet flushes and a man speaks"], "question": "which entity is about a toilet?", "label": 1}, {"captions": ["a toilet flushes and water drains", "someone is typing on a computer keyboard"], "sample_ids": ["sfAvvZwdLCY", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["water drains, flushes, water", "keyboard, type, computer"], "captions_pred_video": ["footage of the toilet in the bathroom", "how to make money on youtube in spanish"], "captions_pred_audio": ["a toilet is flushed", "a person is typing on a keyboard"], "question": "which object is used to type on a computer", "label": 1}, {"captions": ["a beep occurs briefly", "a clock ticktocks"], "sample_ids": ["xtWeJ56-U-g", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["beep, occur, briefly", "ticktocks, clock, ticktocks"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["mechanisms are ticking and a beep 
is heard ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a horn rings out as a machine runs by"], "sample_ids": ["sfAvvZwdLCY", "slZLHwNbbt4"], "start_seconds": ["20", "300"], "properties": ["water drains, flushes, water", "a, horn, run"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a toilet is flushed", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a child speaks in closed space"], "sample_ids": ["xjvTpk2Zpr8", "yW6FWLSLkx4"], "start_seconds": ["70", "40"], "properties": ["engine, run, wind", "child, space, speak"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "someone is typing on a computer keyboard"], "sample_ids": ["yeFvk9x0wWI", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["chirp, twitter, clatter", "keyboard, type, computer"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "how to make money on youtube in spanish"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a person is typing on a keyboard"], "question": "which entity is typing", "label": 1}, {"captions": ["a baby laugh at a sputter", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sLUnaPT5gM8", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["laugh, sputter, baby", "loud, jet engine, roar"], 
"captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vSeGhaZt-aI", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["water, bubbles, speak", "a woman, a television program, a bird"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "dishes cling together then a man begins to speak"], "sample_ids": ["ugHJF0hfYkg", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["engine, running, continuously", "cling, speak, dishes"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a helicopter is flying overhead ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "birds chirp and objects are moved around"], "sample_ids": ["rwTERCUno", "yPUYU6t3rwo"], "start_seconds": ["90", "370"], "properties": ["engine, idle, sputter", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["an engine is idling and vibrating", "insects buzz and a man speaks"], 
"question": "which entity is not a person?", "label": 0}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["yYEVLuqEytU", "xjvTpk2Zpr8"], "start_seconds": ["40", "70"], "properties": ["animal, pig, background", "wind, blows, vehicle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a duck quacks continuously"], "sample_ids": ["yks4cLgIDMc", "vh30P49Po6s"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "quacks, continuously, duck"], "captions_pred_video": ["footage of two kids wrestling on the floor", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a child is crying", "a duck is quacking loudly"], "question": "which entity is speaking", "label": 0}, {"captions": ["some men converse over an engine running", "people cheer as a vehicle engine revs"], "sample_ids": ["sCiy7QS1U", "xjhAnI2q6hM"], "start_seconds": ["300", "6"], "properties": ["men, converse, engine", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a truck is revving its engine and a man is speaking "], "question": "which entity shows people cheering?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["x6ijhqRY38s", "zl9Dqx-j7q4"], "start_seconds": ["250", "6"], "properties": ["bowl, silverware, 
man", "engine, laugh, loud"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a jet engine roars "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["w6RTHR6AeAg", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["call, owl, screech", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["zl9Dqx-j7q4", "sapQIQUhFc"], "start_seconds": ["6", "280"], "properties": ["engine, laugh, loud", "liquid, flow, distance"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["a stream runs then someone speaks", "a woman speaks and is crumpling paper"], "sample_ids": ["wbHTKEJZyhc", "xvDdE3zNf8Y"], "start_seconds": ["20", "120"], "properties": ["stream, run, someone", "A, crumple, paper"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video 
footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a woman speaks and crumples paper"], "question": "which entity is crumpling paper", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "someone is typing on a computer keyboard"], "sample_ids": ["u7C-AEBQM", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["ticks, rhythmic, quiet", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a ticktock of a clock", "a person is typing on a keyboard"], "question": "which is quieter", "label": 1}, {"captions": ["a person sniffles and sneezes", "a woman speaks as she rubs two objects together"], "sample_ids": ["uRlbY6aoBU", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["sneezes, sniffles, person", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is sneezing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["someone snores nearby", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["spJCm8tD9Zo", "uZesmtKZGSw"], "start_seconds": ["90", "250"], "properties": ["someone snores, nearby, someone", "men, talk, cars"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an emergency siren wails as it passes", "some tunes played by whistling"], "sample_ids": ["vGj1XLJvNrw", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], 
"properties": ["wails, wails, pass", "tune, play, whistling"], "captions_pred_video": ["footage of a police car driving down a city street", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a person whistling a song"], "question": "which entity is not playing a tune?", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sa6TLVbooCc", "tDVADusiIoc"], "start_seconds": ["240", "60"], "properties": ["people, laugh, child", "water, radio, man"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a child speaking?", "label": 0}, {"captions": ["birds chirps while a siren signals in the distance", "a car speeding up in the distance"], "sample_ids": ["uKCSGgof8gI", "u0TrcHhkPQ"], "start_seconds": ["12", "20"], "properties": ["chirps, distance, signal", "distance, car, speed"], "captions_pred_video": ["footage of a street in a small town on a sunny day", null], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a train engine runs and a horn blows", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zPX9o1uDiI", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["engine, horn, run", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle?", 
"label": 0}, {"captions": ["two women and a man talk while a kid cries", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wyllXV6PjKo", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["a kid, talk, cry", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman speaks and a baby cries", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "an insect buzzes around continuously"], "sample_ids": ["wvKpEYswXO0", "v25l1jef3JY"], "start_seconds": ["150", "0"], "properties": ["water, tap, run", "buzzes, continuously, insect"], "captions_pred_video": ["of the person preparing food in the kitchen", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a person is snoring while sleeping", "a man speaks as a motor runs in the background"], "sample_ids": ["vJrjSeP17yE", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "background, motor, run"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person snoring loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 0}, {"captions": ["multiple birds chirp and an animal grunts", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["tDlysoZiA1I", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["animal, grunt, multiple", "animal, grunts, chirps"], 
"captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "birds are chirping and a rooster is crowing "], "question": "which entity has more grunts", "label": 1}, {"captions": ["a cat meows and children speak", "material crumbles into a microphone"], "sample_ids": ["x5cuQjOdM3E", "vofpvUo6NAw"], "start_seconds": ["30", "220"], "properties": ["cat, speak, children", "material, crumbles, microphone"], "captions_pred_video": ["a black background with an airplane flying in the sky", "person wrapping a toy car in a plastic bag"], "captions_pred_audio": ["a cat meows and a woman speaks", "paper is being crumpled and crinkled"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wqUmIEzuNz4", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["frog, bird, vocalize", "loud, multiple, distance"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", null], "captions_pred_audio": ["a cat meows and rustles", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sG7TyPnFDR0", "su6FAOcOA8c"], "start_seconds": ["180", "4"], "properties": ["beeps, machine, smoke alarm", "engine, idle, woman"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["multiple people speak and children 
yell while water gurgles", "an infant crying and a woman speaking with some distant murmuring"], "sample_ids": ["vb1fPSDI4c", "smDKStoHBJo"], "start_seconds": ["30", "0"], "properties": ["multiple, people, yell", "a, infant, speaking"], "captions_pred_video": [null, "a man holding a crying baby in his arms"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a baby is crying and a woman is speaking"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "an animal quacks rapidly"], "sample_ids": ["vfYTJq7nU", "vh30P49Po6s"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "animal, quacks, rapidly"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a duck is quacking loudly"], "question": "which entity is a duck?", "label": 0}, {"captions": ["a beep repeats multiple times", "a duck quacks loudly and continuously"], "sample_ids": ["y682ml90jGw", "vh30P49Po6s"], "start_seconds": ["11", "30"], "properties": ["beep, repeat, multiple", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a beeping sound is being made ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks in wind", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["yVumC9TGknc", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["ticktocks, clock, wind", "loud, jet engine, roar"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a series of beeps and chirps", "an aircraft engine roars and a 
man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a man speaks as a car is passing by"], "sample_ids": ["wvKpEYswXO0", "sK4u5T8hW78"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "a, car, pass"], "captions_pred_video": ["of the person preparing food in the kitchen", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wyllXV6PjKo", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["a baby, a woman, a man", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "water splashes as an animal walks through"], "sample_ids": ["sWZzXuWYY", "w1ir-sZ3Im8"], "start_seconds": ["420", "90"], "properties": ["male, clanks, thumps", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a sewing machine runs 
and a man speaks", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xjvTpk2Zpr8", "xKB8O8LTs6s"], "start_seconds": ["70", "70"], "properties": ["engine, run, wind", "music, gunfire, explosion"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a jet engine roars and wind blows ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a movie?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle accelerates squealing tires"], "sample_ids": ["sfAvvZwdLCY", "sd7xVssqlw"], "start_seconds": ["20", "50"], "properties": ["water drains, flushes, water", "accelerates, tires, squealing"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xOZfdgAgJ9o", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["woman, whimpering, speaking", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a dog is whimpering"], "question": "which woman is speaking", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xKB8O8LTs6s", "xBxDz0CFVn0"], 
"start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "stream, water, flow"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage is blurry and out of focus"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a man speaks as a car is passing by"], "sample_ids": ["vveS8HT7Uog", "sK4u5T8hW78"], "start_seconds": ["100", "30"], "properties": ["a man, objects, speak", "a, car, pass"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a man is speaking with background noise and breathing sounds "], "question": "which object is rubbed together", "label": 0}, {"captions": ["a baby cries and a woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["tMbMDvT50j8", "wwyfGO2J4"], "start_seconds": ["12", "90"], "properties": ["a, cry, woman", "people, applaud, hoot"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", 
"people converse as a motor runs and air brakes hiss"], "sample_ids": ["xyL9F5VrjkE", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["wind, blows, vehicle", "People, motor, brakes"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "vehicles pass by on a roadway"], "sample_ids": ["xl2PIWyXaM", "tgbONvsP47Y"], "start_seconds": ["160", "0"], "properties": ["chirp, man, younger person", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["birds are chirping and people are talking", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vlS6YMeWAPo", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["sheep, baa, birds", "loud, jet engine, roar"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a goat bleats and birds chirp", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vbr9mHKc8WM", "sLUnaPT5gM8"], "start_seconds": ["40", "0"], "properties": ["noise, loudness, engine", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["an engine is idling", "a baby is laughing 
and breathing while a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "some men converse over an engine running"], "sample_ids": ["xyL9F5VrjkE", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["wind, blows, vehicle", "men, converse, engine"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks while water drains", "a drill drills through something then people begin laughing"], "sample_ids": ["vSeGhaZt-aI", "tEE3MpBt1sg"], "start_seconds": ["50", "50"], "properties": ["water, drain, man", "drill, something, laugh"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "people are laughing breathing and speaking with background noise "], "question": "which entity is about a drill?", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["zuua6-5goWw", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["sound, pop, bird", "background, motor, run"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["an insect buzzes around continuously", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["v25l1jef3JY", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["buzzes, continuously, insect", "People, motor, brakes"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is not a person?", "label": 0}, {"captions": ["a man speaks as a car is passing by", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sK4u5T8hW78", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a, car, pass", "female, spraying, scream"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a duck quacks loudly and continuously", "multiple people speak and 
children yell while water gurgles"], "sample_ids": ["vh30P49Po6s", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "multiple, people, yell"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a woman speaks and then a man speaks"], "sample_ids": ["tDlysoZiA1I", "vbpKkWvfOu4"], "start_seconds": ["0", "560"], "properties": ["animal, grunt, chirp", "a, man, speaks"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a man is speaking"], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zY3icUyMdh8", "wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "rooster, crow, background, men"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a bird?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a young woman speaks and laughs and an animal snorts"], "sample_ids": 
["uYT5gxnyMWM", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["female, spraying, scream", "a woman, laughs, animal"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a woman speaking and spraying?", "label": 0}, {"captions": ["a door opens and closes", "an engine runs loudly"], "sample_ids": ["vBHyYJ8pL0", "vqZuVbG6-HI"], "start_seconds": ["2", "130"], "properties": ["open, close, door", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "water pouring and bubbling"], "sample_ids": ["uWPRNLnpy7Y", "uyRfq-jKPpo"], "start_seconds": ["10", "50"], "properties": ["accelerate, laugh, vehicle", "water, bubbles, pouring"], "captions_pred_video": ["is taken from a car driving down the street", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], 
"captions_pred_audio": ["a car accelerates and revs its engine ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "an infant crying frantically"], "sample_ids": ["vXlk0lIQBFo", "zwOBqeFTgiU"], "start_seconds": ["470", "30"], "properties": ["wind, talk, vocalize", "cry, infant, frantically"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "of the baby crying in the car seat"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a man speaks as a car is passing by"], "sample_ids": ["tDVADusiIoc", "sK4u5T8hW78"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "a, car, pass"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "an airplane engine runs"], "sample_ids": ["zFjIWfSD-4", "yVPZ2MNWpms"], "start_seconds": ["410", "0"], "properties": ["People, motor, brakes", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], 
"captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a car is driving by on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "water is sprayed across a hard surface"], "sample_ids": ["siJFXfGWgDk", "sQwlkXjQabo"], "start_seconds": ["50", "10"], "properties": ["a, bird, vehicle", "water, spray, surface"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "vehicles pass by on a roadway"], "sample_ids": ["sapQIQUhFc", "tgbONvsP47Y"], "start_seconds": ["280", "0"], "properties": ["liquid, flow, distance", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "birds chirp quietly and an adult man speaks"], "sample_ids": ["y8dSeubCNI", "zuua6-5goWw"], "start_seconds": ["4", "30"], "properties": ["engine revving, people speaking, motorcycle", "birds, chirp, quiet, man, speaks"], "captions_pred_video": [null, "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["an engine revving and people talking in the background", "birds are chirping and a man is speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "paper is crumpling consistently"], "sample_ids": ["wjsXBsc7M40", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a baby laughs and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is more likely to be a video", "label": 0}, {"captions": ["food is frying and sizzles", "people speak as gunfire rings out"], "sample_ids": ["zNRChLjqcU", "wqTCwqVRDlk"], "start_seconds": ["220", "80"], "properties": ["food is frying, sizzles, food", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sZvwOuuPGP0", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["engine, diesel, truck", "loud, laughter, intermittent"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a medium engine 
is running ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is intermittent", "label": 1}, {"captions": ["scraping and female speech with distant music", "a man talks while a clock does ticktock"], "sample_ids": ["yHeVV-xeOxQ", "spYNpeN7rPY"], "start_seconds": ["130", "1"], "properties": ["female, speech, music", "a clock, ticktock, man"], "captions_pred_video": ["of a girl milking a goat's udder", "in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking and breathing with background noise "], "question": "which entity is a clock?", "label": 1}, {"captions": ["rustling with distant murmuring", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wnNNcxAPwGQ", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["sound, distance, rustling", "stream, water, flow"], "captions_pred_video": ["footage of a yellow truck doing a burnout on 
a race track", "footage is blurry and out of focus"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a goat screams and people speak in the background", "speaking following by laughing and clapping"], "sample_ids": ["xC8kbrKJmco", "u2f5NpsoHBg"], "start_seconds": ["0", "30"], "properties": ["background, goat, scream", "person, laugh, clap"], "captions_pred_video": [null, "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a goat is bleating ", "a woman is speaking and a crowd is clapping"], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["a dark barks and whimpers", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sYj4hpDUZDQ", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["barks, whimpers, dark", "music, gunfire, explosion"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog barks and a cat meows", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zliInBdC98Y", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["a, baby, cries, wails", "applause, audience, yells"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an audience 
gives applause as a man yells and a group sings", "water is sprayed across a hard surface"], "sample_ids": ["tdWhHV3X25Q", "sQwlkXjQabo"], "start_seconds": ["60", "10"], "properties": ["applause, audience, yells", "water, spray, surface"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["sawing of wood and rustling with leaves blowing in the distance", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uiItxDsDMFI", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["sound, distance, leaves", "music, gunfire, explosion"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a saw is being used with background noise ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a woman speaks happily and an animal chirps"], "sample_ids": ["tDVADusiIoc", "uWAAAL4CIoc"], "start_seconds": ["60", "0"], "properties": ["wind, radio, waves", "a woman, chirps, animal"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "water flows and trickles"], "sample_ids": ["sQwlkXjQabo", "tB7hWb9gTuQ"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "water, flow, trickle"], 
"captions_pred_video": ["a close-up of a red car with water droplets on the hood", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["spraying followed by silence", "water is splashing and gurgling"], "question": "which entity is flowing", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["wudZTNBtVqc", "viuTg1M-dqg"], "start_seconds": ["60", "30"], "properties": ["accelerates, engine, wind", "two men, speak, follow"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "some tunes played by whistling"], "sample_ids": ["tEE3MpBt1sg", "u6BnG6YZqJ4"], "start_seconds": ["50", "0"], "properties": ["drill, something, laugh", "tune, play, whistling"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sTpirNYo8vQ", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["a, tone, fast", "engine, laugh, loud"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a jet engine roars "], "question": "which entity is 
louder", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["vbZ-0lGPneg", "yeFvk9x0wWI"], "start_seconds": ["30", "30"], "properties": ["a woman, a television program, a bird", "clack, bird, chirp"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "birds chirp in the background as a car drives by "], "question": "which entity has more birds", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tK4VlLsNxak", "vYkA3cfXp5Q"], "start_seconds": ["120", "30"], "properties": ["a, dial, telephone", "engine, accelerate, idle"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["w1mlz3Pe4fU", "zl9Dqx-j7q4"], "start_seconds": ["300", "6"], "properties": ["vocalize, chirp, continuously", "engine, laugh, loud"], "captions_pred_video": ["of a bird in a cage", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and singing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["vK93VuO0yNc", "sYITalLZjj4"], "start_seconds": ["30", "30"], "properties": ["male voice, bus, rumble", "water, rushes, background, birds"], 
"captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "two ducks are swimming in the water near each other"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "wind blows and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "pigeons vocalize and birds chirp"], "sample_ids": ["tDlysoZiA1I", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["animal, grunts, chirps", "vocalize, bird, chirp"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of the pigeon in the cage"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a car speeding up in the distance"], "sample_ids": ["xhmRY9yhC7c", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["a, laugh, infant", "distance, car, speed"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a man talks as several small engines run", "water splashes and a motorboat passes as people yell"], "sample_ids": ["u9A6VZQCZpU", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["a, man, talk", "water, splashes, motorboat"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about water?", "label": 1}, {"captions": ["an engine runs and a man speaks", "a vehicle engine accelerating then running on idle"], "sample_ids": ["yT5WfYMRr-U", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], 
"properties": ["engine, run, man", "engine, accelerate, idle"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "an engine is idling"], "question": "which entity is a vehicle engine?", "label": 1}, {"captions": ["a beep occurs briefly", "a man speaks followed by another man speaking outside"], "sample_ids": ["xtWeJ56-U-g", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["beep, occur, briefly", "two men, speak, follow"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "three men talk while wind blows and some liquid flows"], "sample_ids": ["su6FAOcOA8c", "vJ7JPEFhyLA"], "start_seconds": ["4", "16"], "properties": ["engine, idle, woman", "three men, wind, flow"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a bus engine?", "label": 0}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "small dogs yip and bark sharply"], "sample_ids": ["sjlVMgdGSK0", 
"v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["car, revving, loudly", "bark, yip, sharply"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a speedboat passes quickly on the water"], "sample_ids": ["vlJS7LN2XyM", "tjmoSi330GM"], "start_seconds": ["30", "23"], "properties": ["background, clocks, ticking", "speed, water, boat"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a ticktock of a clock", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 1}, {"captions": ["birds chirp and wind blows", "paper is crumpling consistently"], "sample_ids": ["sxIvBMSavMQ", "v5cSxLaHADY"], "start_seconds": ["210", "0"], "properties": ["birds, chirp, wind", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 
101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a man speaks as a car is passing by"], "sample_ids": ["uPDn2BFTHk", "sK4u5T8hW78"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["xKB8O8LTs6s", "tiDFTC-5vU"], "start_seconds": ["70", "30"], "properties": ["music, gunshots, explosion", "male, duck, laugh"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and ducks are quacking"], "question": "which entity is a comedy", "label": 1}, {"captions": ["a man speaks as crickets sing", "waves crash against a shoreline and people speak"], "sample_ids": ["ryFDPxgDOGc", "yFB25fqfU8I"], "start_seconds": ["570", "300"], "properties": ["a, crickets, sing", "wave, crash, shoreline"], "captions_pred_video": ["a 
group of people dressed in camouflage and hunting gear in the dark", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be in a desert?", "label": 0}, {"captions": ["a vehicle engine runs and someone speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zF8yoL0rkbI", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["engine, run, someone", "rooster, crow, background, men"], "captions_pred_video": ["footage of the traffic on the street at night", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a toilet flushes and a female speaks"], "sample_ids": ["zTLVJCo4WEE", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["a, crickets, sing", "female, flushes, toilet"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "birds chirp as a train approaches"], "sample_ids": ["wqADXCzngMw", "xM4joTqDVp4"], "start_seconds": ["340", "160"], "properties": ["engine, idle, man", "bird, chirp, train"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", 
"birds are chirping and a train is moving "], "question": "which entity is a train", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "a person snores loudly multiple times at a close distance"], "sample_ids": ["w-4gHptFNuU", "sSMl2vc3ek"], "start_seconds": ["21", "20"], "properties": ["engine revs, accelerates, bump", "loud, multiple, distance"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "a duck quacks continuously"], "sample_ids": ["uzQnlJXBbOM", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "quacks, continuously, duck"], "captions_pred_video": ["footage of a person using a cell phone on a table", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a telephone rings and a man speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a machine beeps continuously"], "sample_ids": ["uYT5gxnyMWM", "y682ml90jGw"], "start_seconds": ["50", "11"], "properties": ["person, spray, yell", "beeps, machine, continuously"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a stream of water runs briefly"], "sample_ids": ["wnpJndXuxLc", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["blows, vehicle, train", "stream, water, run"], "captions_pred_video": ["footage of the train coming down the 
tracks on a snowy day", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a infant makes noise and is excited"], "sample_ids": ["tZGN5a7ybxo", "wIJK3-5y0kA"], "start_seconds": ["60", "30"], "properties": ["ring, train, horn", "noise, excited, infant"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "an infant crying frantically"], "sample_ids": ["yks4cLgIDMc", "zwOBqeFTgiU"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "cry, infant, frantically"], "captions_pred_video": ["footage of two kids wrestling on the floor", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and a child is crying", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vddP56-ogds", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["water, splash, person, laugh", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "a man speaks as a car is passing by"], 
"sample_ids": ["rqu8iB22IY", "sK4u5T8hW78"], "start_seconds": ["5", "30"], "properties": ["sound, repeats, laugh", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wTideSjRFS0", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["food, sizzle, woman", "harsh, wind, blows"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking with wind noise in the background "], "question": "which entity is about a woman speaking and food sizzling while frying?", "label": 0}, {"captions": ["water splashes and a motorboat passes as people yell", "winds blows roughly as a vehicle races past"], "sample_ids": ["w5W5Kqtc8E", "xjvTpk2Zpr8"], "start_seconds": ["100", "70"], "properties": ["water, splashes, motorboat", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a man talks as 
several small engines run", "a man speaks as a car is passing by"], "sample_ids": ["u9A6VZQCZpU", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zALy31PjDl0", "vbZ-0lGPneg"], "start_seconds": ["21", "30"], "properties": ["a man, a vehicle, a horn", "a woman, a television program, a bird"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a horn rings out as a machine runs by"], "sample_ids": ["wz7N8YRy74I", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["rooster, crow, background, people", "a, horn, run"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing 
", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["v0x1odnXtP0", "tDlysoZiA1I"], "start_seconds": ["210", "0"], "properties": ["keyboard, type, computer", "animal, grunts, chirps"], "captions_pred_video": ["how to make money on youtube in spanish", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a person is typing on a keyboard", "birds are chirping and a rooster is crowing "], "question": "which entity is not a person?", "label": 1}, {"captions": ["water pouring and bubbling", "a car speeding up in the distance"], "sample_ids": ["uyRfq-jKPpo", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["water, bubbles, pouring", "distance, car, speed"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", null], "captions_pred_audio": ["water is running from a faucet", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["several beeps are followed by a hit and a woman talking", "a machine clanks and thumps and a male speaks"], "sample_ids": ["w34HjHr6gAY", "sWZzXuWYY"], 
"start_seconds": ["30", "420"], "properties": ["beeps, hit, woman", "male, clanks, thumps"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a sewing machine runs and a man speaks"], "question": "which entity has a male speaking?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "loud ringing of a telephone stops followed by a man speaking and a digital beep"], "sample_ids": ["vb1fPSDI4c", "uzQnlJXBbOM"], "start_seconds": ["30", "50"], "properties": ["multiple, people, yell", "ringing, beep, stop"], "captions_pred_video": [null, "footage of a person using a cell phone on a table"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a telephone rings and a man speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a machine beeps continuously"], "sample_ids": ["tPJvjq9QePY", "y682ml90jGw"], "start_seconds": ["40", "11"], "properties": ["bleats, person, speak", "beeps, machine, continuously"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a clock ticktocks", "pigeons vocalize and birds chirp"], "sample_ids": ["v-g-j2uTByM", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["ticktocks, clock, ticktocks", "vocalize, 
bird, chirp"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of the pigeon in the cage"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["xOZfdgAgJ9o", "y2bVZ7rz-5M"], "start_seconds": ["40", "280"], "properties": ["woman, whimpering, speaking", "motor noise, horn, siren"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["some tunes played by whistling", "birds twitter and chirp and clatter"], "sample_ids": ["u6BnG6YZqJ4", "yeFvk9x0wWI"], "start_seconds": ["0", "30"], "properties": ["tune, play, whistling", "chirp, twitter, clatter"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a person whistling a song", "birds chirp in the background as a car drives by "], "question": "which entity is not a musical instrument", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["wnpJndXuxLc", "vXlk0lIQBFo"], "start_seconds": ["50", "470"], "properties": ["beeps, loud, whistle", "wind, speak, vocalize"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the 
background ", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is not a series of light horn beeps followed by a loud steam whistle?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "an airplane engine runs"], "sample_ids": ["tOSWIURC-4", "yVPZ2MNWpms"], "start_seconds": ["0", "0"], "properties": ["engine, work, nearby", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a lawn mower is running ", "a car is driving by on the road "], "question": "which entity has a moving engine", "label": 1}, {"captions": ["an electronic device bleeps once", "a woman and man speak while food is frying"], "sample_ids": ["tHJ6JSa8Y4", "zk-xJGQU8-4"], "start_seconds": ["0", "130"], "properties": ["bleeps, electronic, device", "food, man, woman"], "captions_pred_video": [null, "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a clock is ticking and beeping", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "wind blows as people chatter quietly"], "sample_ids": ["w34HjHr6gAY", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "wind, chatter, people"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage is blurry and out of 
focus"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a machine beeps continuously"], "sample_ids": ["zk-xJGQU8-4", "y682ml90jGw"], "start_seconds": ["130", "11"], "properties": ["food, man, woman", "beeps, machine, continuously"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["water running down a sink while a man is talking", "vehicles pass by on a roadway"], "sample_ids": ["vSeGhaZt-aI", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["water, sink, talk", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaking with light rustling", "a man speaks as a car is passing by"], "sample_ids": ["zOZleIRqZm4", "sK4u5T8hW78"], "start_seconds": ["80", "30"], "properties": ["light, rustling, man", "a, car, pass"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": 
["a man is speaking with crickets chirping in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more likely to be in a city", "label": 1}, {"captions": ["some people speak", "a woman talking as an infant is crying"], "sample_ids": ["vbZ-0lGPneg", "tMbMDvT50j8"], "start_seconds": ["30", "12"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "a, talk, infant"], "captions_pred_video": ["of a man holding a baby duck in his hands", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a baby cries and a woman speaks"], "question": "which entity is about a woman talking to an infant?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "wind blowing followed by a zoom"], "sample_ids": ["xO-Q2BlIIPU", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["two men, exclamation, speak", "wind, blow, zoom"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a person whistles a meandering tune", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["uFoga8sHpiw", "wqZ135Ssz0"], "start_seconds": ["90", "60"], "properties": ["person, tune, whistle", "two men, woman, birds"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a song", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["several ducks are quacking and squawking", "wind blowing followed by a 
zoom"], "sample_ids": ["wfHeoPDLMaM", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["quacking, squawking, ducks", "wind, blow, zoom"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["ducks are quacking", "wind blows and a chainsaw cuts through wood "], "question": "which entity is silent", "label": 1}, {"captions": ["a duck quacks several times", "a man speaks as a car is passing by"], "sample_ids": ["vh30P49Po6s", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["quacks, duck, several", "a, car, pass"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a duck is quacking loudly", "a man is 
speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a person speaks over rustling leaves", "a man speaks over intermittent keyboard taps"], "sample_ids": ["zOZleIRqZm4", "tw76HGONaKg"], "start_seconds": ["80", "570"], "properties": ["rustling, leaves, person", "audio, man, keyboard"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man speaks and types on a computer keyboard "], "question": "which entity is a person speaking over?", "label": 0}, {"captions": ["a goat bleats and someone makes a calling noise", "small dogs yip and bark sharply"], "sample_ids": ["vlS6YMeWAPo", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["noise, bleat, call", "bark, yip, sharply"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a goat bleats and birds chirp", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, 
{"captions": ["someone snores nearby", "a child speaks in closed space"], "sample_ids": ["spJCm8tD9Zo", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["someone snores, nearby, someone", "child, space, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a drill runs and two people laugh", "water splashes as an animal walks through"], "sample_ids": ["tEE3MpBt1sg", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["two people, laugh, drill", "animal, water, splashes"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wnpJndXuxLc", "tdWhHV3X25Q"], "start_seconds": ["50", "60"], "properties": ["blows, vehicle, train", "applause, audience, yells"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a person", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a motorcycle engine works nearby"], "sample_ids": ["vdoxuJn9lTc", "tOSWIURC-4"], "start_seconds": ["40", "0"], "properties": ["person, burp, 
girl", "engine, work, nearby"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "a lawn mower is running "], "question": "which entity is working", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wTideSjRFS0", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["food, sizzle, woman", "multiple, people, yell"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a crowd of people are talking and laughing"], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs", "a toilet flushes and a female speaks"], "sample_ids": ["yLCORCnd35Q", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["engine, aircraft, runs", "female, flushes, toilet"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "people cheer as a vehicle engine revs"], "sample_ids": ["zcDwZ6W7E3E", "xjhAnI2q6hM"], "start_seconds": ["180", "6"], "properties": ["a, man, speak", "engine revs, vehicle, people"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", 
"label": 1}, {"captions": ["a male speaks over some small clicks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uXxVebHsGZ8", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["male, clicks, speak", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "vehicles pass by on a roadway"], "sample_ids": ["sWZzXuWYY", "tgbONvsP47Y"], "start_seconds": ["420", "0"], "properties": ["male, clanks, thumps", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a car is driving on the road "], "question": "which entity is a video of vehicles passing by on a roadway?", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a piece of wood is being placed down and sawed"], "sample_ids": ["yYEVLuqEytU", "uiItxDsDMFI"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "wood, piece, saw"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["birds chirp as a train approaches", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xM4joTqDVp4", "yajyRTUQk3U"], "start_seconds": ["160", "400"], "properties": ["bird, chirp, train", "a woman, something, fried"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "- a woman cooking in the kitchen"], 
"captions_pred_audio": ["birds are chirping and a train is moving ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a child speaks in closed space"], "sample_ids": ["vcmWSmvti8", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["music, man, fire", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["male speech with light ticking", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xO-Q2BlIIPU", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["male, speech, ticking", "rustling, ducks, quack"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["ugHJF0hfYkg", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["loud, propeller, move", "a woman, laughs, animal"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is not moving", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a 
telephone rings followed by a woman talking"], "sample_ids": ["xKB8O8LTs6s", "tGcFnX0GHI"], "start_seconds": ["70", "0"], "properties": ["music, gunshots, explosion", "ring, talk, woman"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a woman speaks as she rubs two objects together"], "sample_ids": ["vh30P49Po6s", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["loud, continuous, quacks", "two objects, woman, speak"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is silent", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a man speaks and a rooster crows while men talk in the 
background"], "sample_ids": ["yPUYU6t3rwo", "wz7N8YRy74I"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "rooster, crow, background, men"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["insects buzz and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is about a rooster?", "label": 1}, {"captions": ["a man speaks while water drains", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vSeGhaZt-aI", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["water, drain, man", "a woman, a television program, a bird"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a dog is whimpering"], "question": "which entity has more moving parts", "label": 1}, {"captions": ["people speak as gunfire rings out", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wqTCwqVRDlk", "tDVADusiIoc"], "start_seconds": ["80", "60"], "properties": ["gunfire, ring, speak", "water, radio, man"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sTpirNYo8vQ", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["a, tone, fast", "harsh, wind, blows"], 
"captions_pred_video": ["of a man taking a selfie on a bus", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a woman speaks over sizzling noise"], "sample_ids": ["zCrAfDfv6-A", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["person, mouse, click", "noise, woman, speak"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking while food is frying in the background"], "question": "which entity is speaking over noise", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a car accelerates and wind blows"], "sample_ids": ["wTjoRj1se3U", "u0TrcHhkPQ"], "start_seconds": ["390", "20"], "properties": ["engine, run, people", "accelerates, wind, blows"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a clock ticktocks"], "sample_ids": ["xhmRY9yhC7c", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["a, laugh, infant", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a baby crying in a baby bouncer", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a baby cries and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a boat travels through the waves as the wind blows 
loudly and a man speaks over a radio"], "sample_ids": ["wvKpEYswXO0", "tDVADusiIoc"], "start_seconds": ["150", "60"], "properties": ["sound, water, running", "wind, radio, waves"], "captions_pred_video": ["of the person preparing food in the kitchen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a radio?", "label": 1}, {"captions": ["birds coo incessantly", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yZrFNS7GFBQ", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["coo, bird, incessant", "rustling, ducks, quack"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a duck quacks and a woman speaks"], "question": "which entity is a bird", "label": 0}, {"captions": ["several ducks quack and cocks crow far away", "a man speaks while rain falls onto a hard surface"], "sample_ids": ["sNB8zxXneIM", "wqN6IIHw3po"], "start_seconds": ["20", "30"], "properties": ["several, quack, cocks", "rain, surface, fall"], "captions_pred_video": ["a group of geese in a cage", "in your own words what is happening in this screenshot? 
blood splattered all over the place"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a man is speaking and water is splashing"], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a door opens and birds chirp", "water flows and trickles"], "sample_ids": ["yeFvk9x0wWI", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "water, flow, trickle"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vJ7JPEFhyLA", "su6FAOcOA8c"], "start_seconds": ["16", "4"], "properties": ["three men, wind, flow", "engine, idle, woman"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a goat bleats and someone makes a calling noise"], "sample_ids": ["w6RTHR6AeAg", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["call, owl, screech", "noise, bleat, call"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a goat bleats and birds chirp"], "question": "which entity is a call", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "plastic is tapped on while someone 
speaks"], "sample_ids": ["sWZzXuWYY", "wvKpEYswXO0"], "start_seconds": ["420", "150"], "properties": ["male, speech, banging", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is quieter", "label": 1}, {"captions": ["a person is snoring while sleeping", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vJrjSeP17yE", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "multiple, people, yell"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks over sizzling noise", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["yajyRTUQk3U", "ziUT9IFTkjg"], "start_seconds": ["400", "10"], "properties": ["noise, woman, speak", "background, birds, rustling"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wvKpEYswXO0", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["sound, water, running", "three men, wind, flow"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water 
running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a woman speaking softly?", "label": 0}, {"captions": ["a man speaks over a running engine and blowing wind", "a female speaks softly as paper crinkles"], "sample_ids": ["ylpYOorfH4o", "xvDdE3zNf8Y"], "start_seconds": ["410", "120"], "properties": ["engine, running, wind", "a, female, speaks"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman speaks and crumples paper"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["motors runs briefly and tires screech", "a duck quacks continuously"], "sample_ids": ["yRx9txMcBl0", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "quacks, continuously, duck"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 
5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["birds chirp as a bell rings", "water splashes as an animal walks through"], "sample_ids": ["ziUT9IFTkjg", "w1ir-sZ3Im8"], "start_seconds": ["10", "90"], "properties": ["chirp, bell, ring", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wqUmIEzuNz4", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["frog, bird, vocalize", "engine, laugh, loud"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a cat meows and rustles", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["viuTg1M-dqg", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["two men, speak, follow", "engine, idle, woman"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a heavy rain falls endlessly", 
"someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["wP8ZKrlx3oA", "zO-LSSY92ZM"], "start_seconds": ["40", "30"], "properties": ["heavy, rain, fall", "liquid, surface, sound"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["a heavy rain is falling on a surface", "steam is hissing and hissing"], "question": "which entity is not a liquid?", "label": 0}, {"captions": ["winds blows roughly as a vehicle races past", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["xjvTpk2Zpr8", "ziUT9IFTkjg"], "start_seconds": ["70", "10"], "properties": ["wind, blows, vehicle", "background, birds, rustling"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "birds are chirping and a chime is ringing "], "question": "which entity is more calm", "label": 1}, {"captions": ["a child speaks in closed space", "people speak as gunfire rings out"], 
"sample_ids": ["yW6FWLSLkx4", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["child, space, speak", "gunfire, ring, speak"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war zone", "label": 1}, {"captions": ["someone is burping continuously", "people speak as gunfire rings out"], "sample_ids": ["y636gklDioE", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["burps, burps, burps", "gunfire, ring, speak"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person burps loudly several times", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a stream of water runs briefly", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["x-PeY8Yb8M4", "w5W5Kqtc8E"], "start_seconds": ["300", "100"], "properties": ["stream, water, run", "wind, blow, vehicle"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "goats bleat and metal clings"], "sample_ids": ["tw76HGONaKg", "tH17JPjDPnc"], "start_seconds": ["570", "260"], "properties": ["A, game, keyboard", "bleat, metal, clings"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of 
time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "feed of the goats eating hay in the barn"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a cow is mooing and mechanisms are ticking "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "an airplane engine spools and people speak"], "sample_ids": ["s6DESzUTGjY", "wTjoRj1se3U"], "start_seconds": ["16", "390"], "properties": ["wind, laugh, woman", "airplane, engine, spool"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of a man playing with a remote control 
airplane in a field"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a propeller rotates loudly and intensely"], "sample_ids": ["tK4VlLsNxak", "ugHJF0hfYkg"], "start_seconds": ["120", "10"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "loud, intense, propeller"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a man speaks followed by another man speaking outside"], "sample_ids": ["wEBlkGWVWwE", "viuTg1M-dqg"], "start_seconds": ["260", "30"], "properties": ["a, babble, woman", "two men, speak, follow"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a clock ticktocks"], "sample_ids": ["slZLHwNbbt4", "v-g-j2uTByM"], "start_seconds": ["300", "30"], "properties": ["a, horn, run", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": 
["plastic is tapped on while someone speaks", "a vehicle engine accelerates and wind blows"], "sample_ids": ["wvKpEYswXO0", "wudZTNBtVqc"], "start_seconds": ["150", "60"], "properties": ["plastic, tap, speak", "accelerates, engine, wind"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage is of a parking lot with cars parked in it"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a man speaks as a motor runs in the background"], "sample_ids": ["ugHJF0hfYkg", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["engine, running, continuously", "background, motor, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uYT5gxnyMWM", "tiDFTC-5vU"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "male, duck, laugh"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["children speak as a female ask them questions", "some men converse over an engine running"], "sample_ids": ["wEBlkGWVWwE", "sCiy7QS1U"], "start_seconds": ["260", "300"], "properties": ["female, speak, questions", "men, converse, engine"], "captions_pred_video": ["shows a person writing on the 
whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation between two people?", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "an infant crying as a woman laughs"], "sample_ids": ["vJ7JPEFhyLA", "xhmRY9yhC7c"], "start_seconds": ["16", "20"], "properties": ["three men, wind, flow", "a, laugh, infant"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as horns blow", "someone is typing on a computer keyboard"], "sample_ids": ["tHyNqRyK34A", "v0x1odnXtP0"], "start_seconds": ["24", "210"], "properties": ["a, man, speaks", "keyboard, type, computer"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["a horn honks and then loudly blares", "water splashes as an animal walks through"], "sample_ids": ["wnpJndXuxLc", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["horn, honk, loud", "animal, water, splashes"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a duck 
quacks continuously"], "sample_ids": ["tPJvjq9QePY", "vh30P49Po6s"], "start_seconds": ["40", "30"], "properties": ["bleats, person, speak", "quacks, continuously, duck"], "captions_pred_video": ["a dog and a sheep in a barn", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a baby cries and a man speaks", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 0}, {"captions": ["a male speaks and another male speaks", "material crumbles into a microphone"], "sample_ids": ["viuTg1M-dqg", "vofpvUo6NAw"], "start_seconds": ["30", "220"], "properties": ["two males, speaking, male", "material, crumbles, microphone"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "person wrapping a toy car in a plastic bag"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "paper is being crumpled and crinkled"], "question": "which entity is not a person", "label": 1}, {"captions": ["a machine beeps continuously", "a duck quacks loudly and continuously"], "sample_ids": ["y682ml90jGw", "vh30P49Po6s"], "start_seconds": ["11", "30"], "properties": ["beeps, machine, continuously", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a beeping sound is being made ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["a man speaks as a car is passing by", "a vehicle accelerates and squeals tires"], "sample_ids": ["sK4u5T8hW78", "yRx9txMcBl0"], "start_seconds": ["30", "40"], "properties": ["a, car, pass", "accelerates, tires, squeals"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is revving its engine and skidding "], "question": "which vehicle is accelerating?", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "an insect buzzes around continuously"], "sample_ids": ["soTOh3zYJfY", "v25l1jef3JY"], "start_seconds": ["40", "0"], "properties": ["vehicle, skid, tires", "buzzes, continuously, insect"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a fly is buzzing around a microphone "], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a car accelerates and wind blows"], "sample_ids": ["ugHJF0hfYkg", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["engine, running, continuously", "accelerates, wind, blows"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a 
man speaks while water trickles and flows", "several insects fly while two men talk"], "sample_ids": ["sapQIQUhFc", "s-T9OVOiMLo"], "start_seconds": ["280", "330"], "properties": ["water, trickles, flow", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a clock ticktocks"], "sample_ids": ["xyL9F5VrjkE", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["engine, run, wind", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "people applaud and hoot and chat quietly"], "sample_ids": ["vbr9mHKc8WM", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["noise, loudness, engine", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine is idling", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "animals bleat and moo as a person speaks"], "sample_ids": ["vfYTJq7nU", "tPJvjq9QePY"], "start_seconds": ["130", "40"], "properties": ["ducks, quack, man", "animal, bleat, moo"], "captions_pred_video": [null, "a dog and a sheep in a barn"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a baby cries and a man speaks"], "question": "which animal is speaking", "label": 1}, {"captions": ["water runs 
into a sink while men speak", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vzceMbklWc", "wz7N8YRy74I"], "start_seconds": ["180", "30"], "properties": ["water, sink, run", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["water is running and a man is speaking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more men", "label": 1}, {"captions": ["someone snores nearby", "pigeons vocalize and birds chirp"], "sample_ids": ["spJCm8tD9Zo", "uiS58TNyUiw"], "start_seconds": ["90", "430"], "properties": ["someone snores, nearby, someone", "vocalize, bird, chirp"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the pigeon in the cage"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "someone is typing on a computer keyboard"], "sample_ids": ["s3cTDAj31g", "v0x1odnXtP0"], "start_seconds": ["80", "210"], "properties": ["man, talk, woman", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zCrAfDfv6-A", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "gun, shoot, water"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person whistles a song", "a gun is fired followed by splashing and a person 
sneezing"], "question": "which entity is more likely to cause harm", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a machine beeps continuously"], "sample_ids": ["ukxt9I7eMMg", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["food, pan, cook", "beeps, machine, continuously"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a beeping sound is being made "], "question": "which entity is not a machine?", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "people speak as gunfire rings out"], "sample_ids": ["yajyRTUQk3U", "wqTCwqVRDlk"], "start_seconds": ["400", "80"], "properties": ["a woman, something, fried", "gunfire, ring, speak"], "captions_pred_video": ["- a woman cooking in the kitchen", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["xjhAnI2q6hM", "su6FAOcOA8c"], "start_seconds": ["6", "4"], "properties": ["engine revs, vehicle, people", "engine, idle, woman"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "a speedboat passes quickly on the water"], "sample_ids": ["vddP56-ogds", "tjmoSi330GM"], "start_seconds": ["30", "23"], "properties": ["liquid, laughs, 
man", "speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a motorboat speeds through water with wind noise "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a woman speaks as she rubs two objects together"], "sample_ids": ["zl9Dqx-j7q4", "vzxHnu-SFEw"], "start_seconds": ["6", "80"], "properties": ["engine, laugh, loud", "two objects, woman, speak"], "captions_pred_video": ["footage of a man driving a car in the dark", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a jet engine roars ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "wind blows as people chatter quietly"], "sample_ids": ["voJh2gJxXhA", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["music, frog, croak", "wind, chatter, people"], 
"captions_pred_video": ["a frog on a black background with a red diamond in the center", "footage is blurry and out of focus"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["male speech with light ticking", "a duck quacks continuously"], "sample_ids": ["xO-Q2BlIIPU", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["male, speech, ticking", "quacks, continuously, duck"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["food is frying then a woman speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["ukxt9I7eMMg", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "male, duck, laugh"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and ducks are quacking"], "question": "which entity is about a duck?", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "people applaud and hoot and chat quietly"], "sample_ids": ["sofxkNWaP0s", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["wind, engine, louder", "people, applaud, hoot"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", null], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, 
{"captions": ["a speedboat passes quickly on the water", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tjmoSi330GM", "tiDFTC-5vU"], "start_seconds": ["23", "30"], "properties": ["speed, water, boat", "male, duck, laugh"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", null], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y8dSeubCNI", "sSMl2vc3ek"], "start_seconds": ["4", "20"], "properties": ["engine revving, people speaking, motorcycle", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine revving and people talking in the background", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["t97k0cejSQE", "uEU-Hg5MTN8"], "start_seconds": ["250", "27"], "properties": ["bird, chirp, insect", "a woman, laughs, animal"], "captions_pred_video": ["a bee on a purple thistle flower", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be in a forest", "label": 0}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a infant makes noise and is excited"], "sample_ids": ["w2JXXIAdUdg", "wIJK3-5y0kA"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "noise, excited, infant"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a baby playing with a cat 
in a dark room"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["an airplane engine spools and people speak", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wTjoRj1se3U", "zFjIWfSD-4"], "start_seconds": ["390", "410"], "properties": ["airplane, engine, spool", "People, motor, brakes"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a motor?", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "children cheer as a man speaks then an audience screams"], "sample_ids": ["xzKKf9bKNUo", "vJvryTwuAV8"], "start_seconds": ["10", "16"], "properties": ["background, noise, snoring", "audience, cheer, man"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "some tunes played by whistling"], "sample_ids": ["yYEVLuqEytU", "u6BnG6YZqJ4"], "start_seconds": ["40", "0"], "properties": ["animal, pig, background", "tune, play, whistling"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a person whistling a song"], "question": "which entity is a musical performance", 
"label": 1}, {"captions": ["continuous snoring", "a woman and man speak while food is frying"], "sample_ids": ["sLkeqCDJIyw", "zk-xJGQU8-4"], "start_seconds": ["120", "130"], "properties": ["loud, snoring, noise", "food, man, woman"], "captions_pred_video": [", what is the man doing on the couch? sleeping", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a motor runs steadily as a man speaks, then the motor revs twice"], "sample_ids": ["s3cTDAj31g", "ylpYOorfH4o"], "start_seconds": ["80", "410"], "properties": ["man, talk, woman", "motor, run, steady"], "captions_pred_video": [null, "for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking and an engine is revving"], "question": "which entity is silent", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a man speaks as a car is passing by"], "sample_ids": ["yajyRTUQk3U", "sK4u5T8hW78"], "start_seconds": ["400", "30"], "properties": ["a woman, something, fried", "a, car, pass"], "captions_pred_video": ["- a woman cooking in the kitchen", "for 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "pigeons vocalize and birds chirp"], "sample_ids": ["vMf1dLD6Sng", "uiS58TNyUiw"], "start_seconds": ["6", "430"], "properties": ["frog, bird, vocalize", "vocalize, bird, chirp"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "of the pigeon in the cage"], "captions_pred_audio": ["a frog croaks loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is a frog", "label": 0}, {"captions": ["a female speaks softly as paper crinkles", "a woman speaks as she rubs two objects together"], "sample_ids": ["xvDdE3zNf8Y", "vzxHnu-SFEw"], "start_seconds": ["120", "80"], "properties": ["a, female, speaks", "two objects, woman, speak"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is about a woman speaking?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a train horn blows as it passes by"], "sample_ids": ["vs65y4qmyBE", "zVacuqSb4LI"], "start_seconds": ["340", "30"], "properties": ["engine, run, man", "horn, blows, train"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a child speaks in closed space"], "sample_ids": ["xfaoyyzw2WU", "yW6FWLSLkx4"], "start_seconds": ["180", "40"], "properties": ["loud, jet engine, 
roar", "child, space, speak"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a woman is speaking with background noise and breathing sounds "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["siJFXfGWgDk", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["man, woman, vehicle", "stream, water, flow"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a man speaks as a motor runs in the background"], "sample_ids": ["uEU-Hg5MTN8", "xZepNM9qcRA"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "background, motor, run"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a car speeding up in the distance", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["u0TrcHhkPQ", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["distance, car, speed", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a race car accelerates and 
revs its engine ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["water rushes by", "music plays and a woman speaks on a radio before gunshots are fired"], "sample_ids": ["x-PeY8Yb8M4", "xKB8O8LTs6s"], "start_seconds": ["300", "70"], "properties": ["water, rushes, by", "music, radio, gunshots"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car is driving on a wet road ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a video of a woman speaking on a radio?", "label": 1}, {"captions": ["birds coo incessantly", "an infant crying as a woman laughs"], "sample_ids": ["yZrFNS7GFBQ", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["coo, bird, incessant", "a, laugh, infant"], "captions_pred_video": ["of the bird in the cage", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["an owl hoots in the background ", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "pigeons vocalize and birds chirp"], "sample_ids": ["w9lpbUn0hPc", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["male, wind, rustling", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a train horn blares as a train passes, then fades", "birds chirp and objects are moved around"], "sample_ids": ["zVacuqSb4LI", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], 
"properties": ["blares, fades, train", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a car speeding up in the distance"], "sample_ids": ["yZp6xizR0yU", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["animal, bleat, cry", "distance, car, speed"], "captions_pred_video": ["footage of a woman feeding goats in a barn", null], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a rumble grows louder", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["y4MY9mp8-TA", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["loudness, increase, rumble", "motor noise, horn, siren"], "captions_pred_video": ["a helicopter flying in the sky", "footage of a parade of fire trucks driving down the street"], 
"captions_pred_audio": ["a helicopter flies overhead ", "a truck is honking its horn and a siren is blaring "], "question": "which is louder", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["su6FAOcOA8c", "zl9Dqx-j7q4"], "start_seconds": ["4", "6"], "properties": ["engine, idle, woman", "engine, laugh, loud"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a jet engine roars "], "question": "which entity is a man?", "label": 1}, {"captions": ["a person is whistling a tune", "paper is crumpling consistently"], "sample_ids": ["scYRUkrFLiQ", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["a, tune, whistle", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person whistling a song", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["y682ml90jGw", "yajyRTUQk3U"], "start_seconds": ["11", "400"], "properties": ["beeps, series, electronic", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zofjfKhqLk8", "w5W5Kqtc8E"], "start_seconds": ["10", 
"100"], "properties": ["background, metal, clings", "wind, blow, vehicle"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["u21-Z5gJCB8", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "stream, water, flow"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "plastic is tapped on while someone speaks"], "sample_ids": ["zl9Dqx-j7q4", "wvKpEYswXO0"], "start_seconds": ["6", "150"], "properties": ["motors rev, laugh, loudly", "plastic, tap, speak"], "captions_pred_video": ["footage of a man driving a car in the dark", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a jet engine roars ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sOa7g-44Dag", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["background, man, spray", "loud, laughter, intermittent"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a 
man is speaking and rubbing his hands together ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a person is snoring while sleeping", "a child speaks in closed space"], "sample_ids": ["vJrjSeP17yE", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["a person is sleeping, snoring, person", "child, space, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "vehicles pass by on a roadway"], "sample_ids": ["vbZ-0lGPneg", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["a woman, a television program, a bird", "pass, vehicle, roadway"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a car is driving on the road "], "question": "which entity has more vehicles", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a child speaks in closed space"], "sample_ids": ["sWZzXuWYY", "yW6FWLSLkx4"], "start_seconds": ["420", "40"], "properties": ["male, clanks, thumps", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a woman talks while something is fried and objects are 
tapped"], "sample_ids": ["xM4joTqDVp4", "yajyRTUQk3U"], "start_seconds": ["160", "400"], "properties": ["background, chirp, birds", "a woman, something, fried"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["viuTg1M-dqg", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["two men, speak, follow", "loud, jet engine, roar"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "people cheer as a vehicle engine revs"], "sample_ids": ["w6RTHR6AeAg", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["call, owl, screech", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a car accelerates and wind blows"], "sample_ids": ["tGcFnX0GHI", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["ring, talk, woman", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", 
"a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zTLVJCo4WEE", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "stream, water, flow"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "waves crash against a shoreline and people speak"], "sample_ids": ["w8uLijTqtlU", "yFB25fqfU8I"], "start_seconds": ["70", "300"], "properties": ["wind, microphone, noise", "wave, crash, shoreline"], "captions_pred_video": ["footage is blurry and shaky", "footage of a person surfing in the ocean"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "some men converse over an engine running"], "sample_ids": ["tQWGZLItBXk", "sCiy7QS1U"], "start_seconds": ["170", "300"], "properties": ["music, person, ding", "men, converse, engine"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a person speaking?", "label": 0}, {"captions": ["some tunes played by whistling", "wind blows as people chatter quietly"], "sample_ids": ["u6BnG6YZqJ4", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["tune, play, whistling", "wind, chatter, 
people"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vb1fPSDI4c", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["multiple, people, yell", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a crowd of people are talking and laughing", "a duck quacks and a woman speaks"], "question": "which entity has more people speaking?", "label": 0}, {"captions": ["small dogs growl, bark and yip.", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["sShpyu2l4YQ", "rqu8iB22IY"], "start_seconds": ["0", "5"], "properties": ["growl, bark, yip", "sound, repeats, laugh"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a dog barks and a man speaks while music plays "], "question": "which entity is more likely to be repeated", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "water pouring and bubbling"], "sample_ids": ["vZAw4apG0Es", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["background, tick, repeat", "water, bubbles, pouring"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to 
do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a clock is ticking and people are talking", "water is running from a faucet"], "question": "which entity is more likely to be a video of a man speaking?", "label": 0}, {"captions": ["birds chirp and pigeons vocalize while walking around", "several ducks quack and cocks crow far away"], "sample_ids": ["wIvYjuR3nrg", "sNB8zxXneIM"], "start_seconds": ["9", "20"], "properties": ["birds, pigeons, vocalize", "several, quack, cocks"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "a group of geese in a cage"], "captions_pred_audio": ["birds are chirping and cooing", "a rooster is crowing and wind is blowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a clock ticktocks"], "sample_ids": ["sLUnaPT5gM8", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a male speaks and another male speaks", "a fly buzzes around loudly as birds chirp"], "sample_ids": ["viuTg1M-dqg", "uJV8NDaHqqk"], "start_seconds": ["30", "100"], "properties": ["two males, speaking, male", "loud, fly, chirp"], "captions_pred_video": 
["footage of water coming out of a hole in the ground", "a bee hive in a wooden box"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a swarm of bees buzzing around"], "question": "which entity is louder", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a man speaks as a motor runs in the background"], "sample_ids": ["vr8ZXjEBhMQ", "xZepNM9qcRA"], "start_seconds": ["150", "30"], "properties": ["wind, blow, zoom", "background, motor, run"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a man speaking in the background?", "label": 1}, {"captions": ["two frogs croak at each other", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["zg0X6BnhOLQ", "zl9Dqx-j7q4"], "start_seconds": ["410", "6"], "properties": ["two frogs, croak, at each other", "engine, laugh, loud"], "captions_pred_video": ["footage of lightning in the sky at night", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a frog is croaking", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a woman speaks happily and an animal chirps"], "sample_ids": ["xjhAnI2q6hM", "uWAAAL4CIoc"], "start_seconds": ["6", "0"], "properties": ["wind, blow, loudly", "a woman, chirps, animal"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["children speak and play together", "a power tool runs and touches a 
surface"], "sample_ids": ["yVVP8XvWJTo", "zfvPRf3chY"], "start_seconds": ["260", "290"], "properties": ["children, speak, play", "power tool, run, touch"], "captions_pred_video": ["footage of a playground at a school or daycare center", null], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a man is speaking while a power tool is being used "], "question": "which is not a person", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vZAw4apG0Es", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["background, tick, repeat", "a, scream, girl"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["leaves rustle while man speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zOZleIRqZm4", "uEU-Hg5MTN8"], "start_seconds": ["80", "27"], "properties": ["leaves, rustle, speak", "a woman, laughs, animal"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a muffled toilet flushes and the water drains", "an airplane engine spools and people speak"], "sample_ids": ["sfAvvZwdLCY", "wTjoRj1se3U"], "start_seconds": ["20", "390"], "properties": ["flushes, drains, water", "airplane, engine, spool"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a man playing with a remote control airplane in a field"], 
"captions_pred_audio": ["a toilet is flushed", "a jet engine is running and people are talking"], "question": "which entity is a machine", "label": 1}, {"captions": ["several insects fly while two men talk", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["s-T9OVOiMLo", "zFjIWfSD-4"], "start_seconds": ["330", "410"], "properties": ["several, fly, men", "People, motor, brakes"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", null], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "the rumbling of a bus followed by a soft male voice"], "sample_ids": ["u7C-AEBQM", "vK93VuO0yNc"], "start_seconds": ["30", "30"], "properties": ["ticks, rhythmic, quiet", "male voice, bus, rumble"], "captions_pred_video": [null, "footage is blurry due to the movement of the bus as it drives through the city at night"], "captions_pred_audio": ["a ticktock of a clock", "a car drives by with wind noise in the background "], "question": "which entity is quieter", "label": 0}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a toilet flushes and water drains"], "sample_ids": ["weDbePuc-Xc", "sfAvvZwdLCY"], "start_seconds": ["40", "20"], "properties": ["music, slaps, human", "water drains, flushes, water"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a female speaks softly as paper crinkles"], 
"sample_ids": ["zY3icUyMdh8", "xvDdE3zNf8Y"], "start_seconds": ["20", "120"], "properties": ["dog, bark, engine", "a, female, speaks"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman speaks and crumples paper"], "question": "which entity is speaking softly", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "people applaud and hoot and chat quietly"], "sample_ids": ["w-4gHptFNuU", "wwyfGO2J4"], "start_seconds": ["21", "90"], "properties": ["engine revs, accelerates, bump", "people, applaud, hoot"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a video of a performance", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yVumC9TGknc", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["humming, clock, birds", "airplane, boy, fly"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a series of beeps and chirps", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a man speaks as a car is passing by"], "sample_ids": ["sU53zg9Jp7s", "sK4u5T8hW78"], "start_seconds": ["380", "30"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a, car, 
pass"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vBslzh7saPw", "vb1fPSDI4c"], "start_seconds": ["90", "30"], "properties": ["engine, roar, louder", "multiple, people, yell"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "two women and a man talk while a kid cries"], "sample_ids": ["ukg5L09Wpvo", "wyllXV6PjKo"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "a kid, talk, cry"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman speaks and a baby cries"], "question": "which entity has a kid?", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a horn rings out as a machine runs by"], "sample_ids": ["shmR4OZtzqA", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["man, engine, idle", "a, 
horn, run"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man speaks while a motor runs", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sK4u5T8hW78", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["a, car, pass", "men, talk, cars"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["continuous snoring", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sLkeqCDJIyw", "uYT5gxnyMWM"], "start_seconds": ["120", "50"], "properties": ["loud, snoring, noise", "female, spraying, scream"], "captions_pred_video": [", what is the man doing on the couch? sleeping", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is louder", "label": 1}, {"captions": ["a toilet flushes and water drains", "a clock ticktocks in wind"], "sample_ids": ["sfAvvZwdLCY", "yVumC9TGknc"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "ticktocks, clock, wind"], "captions_pred_video": ["footage of the toilet in the bathroom", "game title screen of the game shadow of the colossus on sony playstation 2"], "captions_pred_audio": ["a toilet is flushed", "a series of beeps and chirps"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["a muffled toilet flushes and the water drains", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sfAvvZwdLCY", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "multiple, people, yell"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, 
{"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wRBHTgrbiwg", "vfYTJq7nU"], "start_seconds": ["50", "130"], "properties": ["birds, chirp, cooing", "rustling, ducks, quack"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a duck quacks and a woman speaks"], "question": "which entity is about birds?", "label": 0}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["ziUT9IFTkjg", "wqZ135Ssz0"], "start_seconds": ["10", "60"], "properties": ["background, birds, rustling", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "water rushes by"], "sample_ids": ["xO-Q2BlIIPU", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["two men, exclamation, speak", "water, rushes, by"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is driving on a wet road "], "question": "which entity is more calm", "label": 0}, {"captions": ["a man speaks while video game music plays with some clicking", "a telephone rings followed by a woman talking"], "sample_ids": ["tw76HGONaKg", "tGcFnX0GHI"], "start_seconds": ["570", "0"], "properties": ["music, click, man", "ring, talk, woman"], 
"captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "someone whistles a tune"], "sample_ids": ["yI-KvObbDoY", "sIXTftIuUgw"], "start_seconds": ["260", "90"], "properties": ["sound, smack, wind", "someone, tune, whistle"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", null], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a clock ticktocks"], "sample_ids": ["zALy31PjDl0", "v-g-j2uTByM"], "start_seconds": ["21", "30"], "properties": ["a man, a vehicle, a horn", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking 
and a car horn is honking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["white noise and birds chirping", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wRBHTgrbiwg", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["noise, white, chirping", "a woman, something, fried"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["an insect buzzes around continuously", "wind loudly blowing while people speak in the background followed by a horn blowing"], "sample_ids": ["v25l1jef3JY", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["buzzes, continuously, insect", "wind, blow, loudly"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a truck is revving its engine and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["food is frying and sizzles", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zNRChLjqcU", "vJ7JPEFhyLA"], "start_seconds": ["220", "16"], "properties": ["food is frying, sizzles, food", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person?", "label": 0}, {"captions": ["dogs bark as an engine runs and a person whistles", "an airplane engine spools and people speak"], "sample_ids": ["zY3icUyMdh8", "wTjoRj1se3U"], 
"start_seconds": ["20", "390"], "properties": ["dog, bark, engine", "airplane, engine, spool"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a jet engine is running and people are talking"], "question": "which entity is a machine", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "wind blows as people chatter quietly"], "sample_ids": ["tDVADusiIoc", "xBxDz0CFVn0"], "start_seconds": ["60", "30"], "properties": ["man, radio, blows", "wind, chatter, people"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["sapQIQUhFc", "w34HjHr6gAY"], "start_seconds": ["280", "30"], "properties": ["liquid, flow, distance", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a beep sounds followed by a child speaking"], "question": "which entity has a 
woman talking?", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a infant makes noise and is excited"], "sample_ids": ["smDKStoHBJo", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["a, talk, baby, cry", "noise, excited, infant"], "captions_pred_video": ["a man holding a crying baby in his arms", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a baby cries and a woman speaks"], "question": "which is a more active scene", "label": 1}, {"captions": ["a man speaking with light rustling", "dishes cling together then a man begins to speak"], "sample_ids": ["zOZleIRqZm4", "sQGXqGcwOTc"], "start_seconds": ["80", "3"], "properties": ["light, rustling, man", "cling, speak, dishes"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["birds chirp and a pop occurs before a man speaks", "water flows as men speak and yell"], "sample_ids": ["zuua6-5goWw", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["sound, pop, bird", "water, flow, men"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a 
man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["water quietly rushes by while birds chirp in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["sYITalLZjj4", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["water, rushes, background, birds", "wind, blows, vehicle"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["wind blows and birds chirp", "a jet engine roars and wind blows "], "question": "which entity is more calm", "label": 0}, {"captions": ["ticking continues without interruption", "a horn honks followed by a loud continuous buzzing while men speak"], "sample_ids": ["v-g-j2uTByM", "wsHBIgzs9Fs"], "start_seconds": ["30", "50"], "properties": ["ticking, continuous, clock", "horn, continuous, buzzing"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "shows a motorcycle riding down a country road with a motorcycle in the foreground"], "captions_pred_audio": ["a clock is ticking loudly", "a car accelerates and revs its engine while a man speaks "], "question": "which entity is continuous", "label": 1}, {"captions": ["several insects fly while two men talk", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["s-T9OVOiMLo", "uYT5gxnyMWM"], "start_seconds": ["330", "50"], "properties": ["several, fly, men", "a, scream, girl"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a woman is speaking and a baby is crying"], "question": "which 
entity has more people", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xjhAnI2q6hM", "tDVADusiIoc"], "start_seconds": ["6", "60"], "properties": ["engine revs, vehicle, people", "water, radio, man"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a vehicle?", "label": 0}, {"captions": ["an animal growls followed by birds chirping", "people speak as gunfire rings out"], "sample_ids": ["y2ZBGpgbhHM", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["animal, growl, bird", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "vehicles pass by on a roadway"], "sample_ids": ["vSeGhaZt-aI", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, speak", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an aircraft engine runs", "water is sprayed across a hard surface"], "sample_ids": ["yLCORCnd35Q", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["engine, aircraft, runs", "water, spray, surface"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow 
airport", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a telephone rings followed by a woman talking"], "sample_ids": ["uPDn2BFTHk", "tGcFnX0GHI"], "start_seconds": ["140", "0"], "properties": ["lady, laugh, baby", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation between a woman and a baby?", "label": 0}, {"captions": ["a clock ticktocks", "water is sprayed across a hard surface"], "sample_ids": ["v-g-j2uTByM", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["ticktocks, clock, ticktocks", "water, spray, surface"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a clock is ticking loudly", "spraying followed by silence"], "question": "which entity is not a clock?", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "people cheer as a vehicle engine revs"], "sample_ids": ["w34HjHr6gAY", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["beeps, squawk, child speaking", "engine revs, vehicle, people"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wqADXCzngMw", "xKB8O8LTs6s"], "start_seconds": ["340", "70"], "properties": ["audio, humming, revving", "music, gunfire, explosion"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a toilet flushes and a female speaks"], "sample_ids": ["xjhAnI2q6hM", "yaln9y8I7ms"], "start_seconds": ["6", "230"], "properties": ["engine revs, vehicle, people", "female, flushes, toilet"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage is blurry and out of focus"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wjsXBsc7M40", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "loud, multiple, distance"], "captions_pred_video": ["footage of the baby playing with a toothbrush", null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a person snoring loudly"], "question": "which entity is 
louder", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["yZp6xizR0yU", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["animal, bleat, cry", "applause, audience, yells"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "wind blows as people chatter quietly"], "sample_ids": ["xV7Mg1QucSc", "xBxDz0CFVn0"], "start_seconds": ["14", "30"], "properties": ["alarm, ticktocks, laughs", "wind, chatter, people"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage is blurry and out of focus"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone whistles a song", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["sIXTftIuUgw", "uWPRNLnpy7Y"], "start_seconds": ["90", "10"], "properties": ["someone, song, whistle", "accelerate, laugh, vehicle"], "captions_pred_video": [null, "is taken from a car driving down the street"], "captions_pred_audio": ["a person whistling a song", "a car accelerates and revs its engine "], "question": "which entity is more likely to be a song", "label": 0}, {"captions": ["a machine beeps continuously", "a woman speaks and then a man speaks"], "sample_ids": ["y682ml90jGw", "vbpKkWvfOu4"], "start_seconds": ["11", "560"], "properties": ["beeps, machine, continuously", "a, man, speaks"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and a man is speaking"], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vXlk0lIQBFo", "tDVADusiIoc"], "start_seconds": ["470", "60"], "properties": ["wind, talk, vocalize", "water, radio, man"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "multiple insects buzz over rustling wind"], "sample_ids": ["vzxHnu-SFEw", "tMJne1a4AFI"], "start_seconds": ["80", "0"], "properties": ["two objects, woman, speak", "wind, buzz, rustling"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a swarm of bees on the ground"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a swarm of bees buzzing around"], "question": "which entity is moving", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wjsXBsc7M40", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "music, gunfire, explosion"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a baby laughs and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a person snores loudly multiple times at a close distance"], "sample_ids": ["t69a8aRKhmc", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["a, b, c", "loud, multiple, distance"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vveS8HT7Uog", "uEU-Hg5MTN8"], "start_seconds": ["100", "27"], "properties": ["a man, objects, speak", "animal, grunts, snorts"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing 
", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a frog croaks as other frogs croak in the background"], "sample_ids": ["ugHJF0hfYkg", "yswmmRZFItk"], "start_seconds": ["10", "0"], "properties": ["loud, propeller, move", "background, frog, croak"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a close up of a frog in the water"], "captions_pred_audio": ["a helicopter is flying overhead ", "a frog is croaking"], "question": "which is quieter", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a woman speaks happily and an animal chirps"], "sample_ids": ["zkKdxzNC97Y", "uWAAAL4CIoc"], "start_seconds": ["27", "0"], "properties": ["loud, bang, noise", "a woman, chirps, animal"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "material crumbles into a microphone"], "sample_ids": ["vYkA3cfXp5Q", "vofpvUo6NAw"], "start_seconds": ["30", "220"], "properties": ["speed, idle, accelerate", "material, crumbles, microphone"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "person wrapping a toy car in a plastic bag"], "captions_pred_audio": ["an engine is idling", "paper is being crumpled and crinkled"], "question": "which entity is more likely to be a video of a car engine?", "label": 0}, {"captions": ["a man speaks as he moves silverware in a bowl", "an infant crying frantically"], "sample_ids": ["x6ijhqRY38s", "zwOBqeFTgiU"], "start_seconds": ["250", "30"], "properties": ["bowl, silverware, man", "cry, infant, frantically"], "captions_pred_video": ["a chef preparing a dish with a 
bottle of wine and a plate of food on a table", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["v0x1odnXtP0", "yajyRTUQk3U"], "start_seconds": ["210", "400"], "properties": ["keyboard, type, computer", "a woman, something, fried"], "captions_pred_video": ["how to make money on youtube in spanish", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking while food is frying in the background"], "question": "which entity is a demonstration of cooking?", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "children cheer as a man speaks then an audience screams"], "sample_ids": ["wnpJndXuxLc", "vJvryTwuAV8"], "start_seconds": ["50", "16"], "properties": ["beeps, loud, whistle", "audience, cheer, man"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is a person speaking to an audience?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zgUgkpk78xU", "uEU-Hg5MTN8"], "start_seconds": ["70", "27"], "properties": ["horn, bells, ring", "a woman, laughs, animal"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 
60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "an infant crying as a woman laughs"], "sample_ids": ["vbpKkWvfOu4", "xhmRY9yhC7c"], "start_seconds": ["560", "20"], "properties": ["a, man, speaks", "a, laugh, infant"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a baby cries and a woman speaks"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["material crumbles into a microphone", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vofpvUo6NAw", "zl9Dqx-j7q4"], "start_seconds": ["220", "6"], "properties": ["material, crumbles, microphone", "engine, laugh, loud"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of a man driving a car in the dark"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a vehicle engine accelerating then running on idle"], "sample_ids": ["w0xsN8X18Y", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["rain, thunder, surface", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while a 
motorboat is moving in the background ", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "an infant crying frantically"], "sample_ids": ["uRExseg-0XI", "zwOBqeFTgiU"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "cry, infant, frantically"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "water flows and trickles"], "sample_ids": ["t8CV69hcvF0", "tB7hWb9gTuQ"], "start_seconds": ["210", "30"], "properties": ["person, sneeze, follow", "water, flow, trickle"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman sneezes and speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["someone is snoring while sleeping", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["ujMt0-D-x2k", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["snore, sleep, someone", "engine, idle, woman"], "captions_pred_video": ["of the dog playing with a toy on the floor", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a subway train is moving "], "question": "which entity is a person", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sLUnaPT5gM8", "xBxDz0CFVn0"], 
"start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "stream, water, flow"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "people speak as gunfire rings out"], "sample_ids": ["vZAw4apG0Es", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["background, tick, repeat", "gunfire, ring, speak"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a child speaks", "a child speaks"], "sample_ids": ["yW6FWLSLkx4", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["a, child, speaks", "a, child, speaks"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which child speaks", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a toilet flushes and water drains"], "sample_ids": ["ylpYOorfH4o", "sfAvvZwdLCY"], "start_seconds": ["410", "20"], "properties": ["engine, run, loud", "water drains, flushes, water"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the 
fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a car accelerates and wind blows"], "sample_ids": ["tPJvjq9QePY", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["bleats, person, speak", "accelerates, wind, blows"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a man speaks as a machine runs"], "sample_ids": ["w0xsN8X18Y", "vD6lYD1l0BY"], "start_seconds": ["30", "330"], "properties": ["music, surface, rain", "a, machine, run"], "captions_pred_video": [null, "game controller being held in the hands of the person"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking and dishes are being washed "], "question": "which entity has a machine running?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "small dogs yip and bark sharply"], "sample_ids": ["tiDFTC-5vU", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["male, duck, laugh", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is 
happening"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["waves crash against a shoreline and people speak", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yFB25fqfU8I", "su6FAOcOA8c"], "start_seconds": ["300", "4"], "properties": ["wave, crash, shoreline", "engine, idle, woman"], "captions_pred_video": ["footage of a person surfing in the ocean", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sapQIQUhFc", "uYT5gxnyMWM"], "start_seconds": ["280", "50"], "properties": ["water, stream, trickles", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "a clock ticktocks"], "sample_ids": ["uYT5gxnyMWM", "v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "birds chirp and objects are moved around"], 
"sample_ids": ["v5P-ThUCINM", "yPUYU6t3rwo"], "start_seconds": ["400", "370"], "properties": ["background, chirp, bird", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and birds are chirping", "insects buzz and a man speaks"], "question": "which entity has birds chirping in the background?", "label": 0}, {"captions": ["a goat bleats as a person speaks", "a stream of water runs briefly"], "sample_ids": ["tPJvjq9QePY", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["bleats, person, speak", "stream, water, run"], "captions_pred_video": ["a dog and a sheep in a barn", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby cries and a man speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sU53zg9Jp7s", "tDVADusiIoc"], "start_seconds": ["380", "60"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "water, radio, man"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman?", "label": 0}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yYEVLuqEytU", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["grunt, slurp, background", 
"rustling, ducks, quack"], "captions_pred_video": ["a baby goat is being petted by a person's hand", null], "captions_pred_audio": ["several sheep bleat and a man speaks", "a duck quacks and a woman speaks"], "question": "which entity is about ducks?", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a woman speaks happily and an animal chirps"], "sample_ids": ["y2bVZ7rz-5M", "uWAAAL4CIoc"], "start_seconds": ["280", "0"], "properties": ["motor noise, horn, siren", "a woman, chirps, animal"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["electronic beeps occur in a short series", "people applaud and hoot and chat quietly"], "sample_ids": ["y682ml90jGw", "wwyfGO2J4"], "start_seconds": ["11", "90"], "properties": ["beeps, series, electronic", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a person is snoring while sleeping"], "sample_ids": ["w2M4i1mklOA", "vJrjSeP17yE"], "start_seconds": ["30", "40"], "properties": ["alarm, gears, turn", "a person is sleeping, snoring, person"], "captions_pred_video": ["footage of an antique clock", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["x9JovgqUcs", "uYT5gxnyMWM"], "start_seconds": ["500", "50"], 
"properties": ["a, man, speaks, keyboard", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and typing on a keyboard?", "label": 0}, {"captions": ["a train horn blares as a train passes, then fades", "a infant makes noise and is excited"], "sample_ids": ["zVacuqSb4LI", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["blares, fades, train", "noise, excited, infant"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak softly as food sizzles", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yhQ2Lg-7qDY", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["food, sizzle, speak", "three men, wind, flow"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a man in a red shirt paddling a 
kayak in the water"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a liquid flowing?", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wqN6IIHw3po", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["rain, surface, fall", "a woman, something, fried"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and water is splashing", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a toilet flushes and a female speaks"], "sample_ids": ["slZLHwNbbt4", "yaln9y8I7ms"], "start_seconds": ["300", "230"], "properties": ["a, horn, run", "female, flushes, toilet"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "water splashes as an animal walks through"], "sample_ids": ["viuTg1M-dqg", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["two men, speak, follow", "animal, water, splashes"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "water splashes and gurgles as people speak"], "question": "which entity is more active", 
"label": 1}, {"captions": ["some people speak", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vbZ-0lGPneg", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "loud, multiple, distance"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wnpJndXuxLc", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["beeps, loud, whistle", "a woman, a television program, a bird"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a series of light horn beeps followed by a loud steam whistle?", "label": 0}, {"captions": ["a woman and man are speaking", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vbpKkWvfOu4", "xKB8O8LTs6s"], "start_seconds": ["560", "70"], "properties": ["two people, speaking, woman, man", "music, gunfire, explosion"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "music plays while a woman speaks and gunshots are fired "], 
"question": "which entity has more action", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a railroad crossing bell rings as a train horn blows"], "sample_ids": ["wudZTNBtVqc", "tZGN5a7ybxo"], "start_seconds": ["60", "60"], "properties": ["accelerates, engine, wind", "ring, train, horn"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "is taken from a moving vehicle on the train tracks"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a train is moving and blowing its horn "], "question": "which entity is a warning device", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a woman speaks and dog vocalizes"], "sample_ids": ["w8uLijTqtlU", "uWAAAL4CIoc"], "start_seconds": ["70", "0"], "properties": ["wind, microphone, noise", "a, dog, vocalize"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a person is whistling a tune"], "sample_ids": ["vfYTJq7nU", "scYRUkrFLiQ"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "a, tune, whistle"], "captions_pred_video": [null, "of the man wearing a bow tie and a suit jacket in front of a red door"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "several insects fly while two men talk"], "sample_ids": ["v5P-ThUCINM", "s-T9OVOiMLo"], "start_seconds": ["400", "330"], "properties": ["background, chirp, bird", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to 
reach the top of the tree"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "people speak as gunfire rings out"], "sample_ids": ["sZPuqDgX2V0", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["engine, accelerate, intercom", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vfYTJq7nU", "xKB8O8LTs6s"], "start_seconds": ["130", "70"], "properties": ["ducks, quack, man", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a duck quacks and a woman speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a door opens and birds chirp", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yeFvk9x0wWI", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["door, open, birds", "engine, laugh, loud"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a jet engine roars "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a toilet flushes and water drains", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sfAvvZwdLCY", "wDVMhEdTiVw"], 
"start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "gun, shoot, water"], "captions_pred_video": ["footage of the toilet in the bathroom", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a toilet is flushed", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "wind blows as people chatter quietly"], "sample_ids": ["xvDdE3zNf8Y", "xBxDz0CFVn0"], "start_seconds": ["120", "30"], "properties": ["A, crumple, paper", "wind, chatter, people"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["uYT5gxnyMWM", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["a, scream, girl", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["sxYkFKFIZD0", "y2bVZ7rz-5M"], "start_seconds": ["20", "280"], "properties": ["screech, man, door", "motor noise, horn, siren"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 
1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is a warning", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a vehicle accelerates and squeals tires"], "sample_ids": ["xhmRY9yhC7c", "yRx9txMcBl0"], "start_seconds": ["20", "40"], "properties": ["a, laugh, infant", "accelerates, tires, squeals"], "captions_pred_video": ["of a baby crying in a baby bouncer", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a baby cries and a woman speaks", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "pigeons vocalize and birds chirp"], "sample_ids": ["ujMt0-D-x2k", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["snoring, rhythmical, nearby", "vocalize, bird, chirp"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of the pigeon in the cage"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 
0}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "an engine runs loudly"], "sample_ids": ["yZrFNS7GFBQ", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["pigeon, buzzes, insect", "loud, engine, run"], "captions_pred_video": ["of the bird in the cage", "footage is blurry because it's raining outside"], "captions_pred_audio": ["an owl hoots in the background ", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["a large crowd cheers and applauds", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": ["rqfQRErjfk8", "s4Uz1Ffgo04"], "start_seconds": ["170", "100"], "properties": ["crowd, cheers, applauds", "roars, background, people speaking"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["a helicopter engine runs", "a person snores loudly multiple times at a close distance"], "sample_ids": ["t5ZbXbniOWk", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["engine, helicopter, run", "loud, multiple, distance"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["zFjIWfSD-4", "y8WEcpOlT3I"], "start_seconds": ["410", "40"], "properties": ["People, motor, brakes", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man 
is speaking while a car is driving and a ticking sound is heard ", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "a machine beeps continuously"], "sample_ids": ["uPDn2BFTHk", "y682ml90jGw"], "start_seconds": ["140", "11"], "properties": ["woman, laughs, speaks", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "vehicles pass by on a roadway"], "sample_ids": ["t25U-v4k4ts", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["a, chirps, bird", "pass, vehicle, roadway"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["zY3icUyMdh8", "uOpoD0gGXcs"], "start_seconds": ["20", "120"], "properties": ["dog, bark, engine", "chirps, woman, bird"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a herd of cows grazing in the field"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "birds are chirping and a man is speaking"], "question": "which entity is a response to a human action", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "water splashes and a door squeaks"], "sample_ids": ["s4Uz1Ffgo04", "sdXV-ylviw"], "start_seconds": ["100", "190"], "properties": ["water, rushes, motorcycle", 
"sound, splash, door"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a dog barks and taps with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["xERFUeZONz8", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["ring, approach, traffic", "animal, grunts, chirps"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["an emergency vehicle siren blares", "birds are chirping and a rooster is crowing "], "question": "which entity is more animal like", "label": 1}, {"captions": ["an insect buzzes around continuously", "a motorcycle idles loudly as wind blows"], "sample_ids": ["v25l1jef3JY", "v7jJS8aAyA"], "start_seconds": ["0", "10"], "properties": ["buzzes, continuously, insect", "wind, blows, loudly"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a motorcycle engine is idling and vibrating"], "question": "which entity is louder", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a man speaks as a car is passing by"], "sample_ids": ["se87d6yxEOA", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "a, car, pass"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a man is speaking with background noise and breathing sounds "], "question": "which object is moving", "label": 0}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "music plays, a person speaks, followed by whooshes and a ding"], "sample_ids": ["uYT5gxnyMWM", "tQWGZLItBXk"], "start_seconds": ["50", "170"], "properties": ["female, spraying, scream", "music, person, ding"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "worms revolution screenshots"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a person speaking?", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "someone is typing on a computer keyboard"], "sample_ids": ["uPDn2BFTHk", "v0x1odnXtP0"], "start_seconds": ["140", "210"], "properties": ["lady, laugh, baby", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y2ZBGpgbhHM", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["dog, chirp, breathe", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tK4VlLsNxak", "uYT5gxnyMWM"], "start_seconds": ["120", "50"], "properties": ["a, dial, telephone", "a, scream, girl"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "wind blows as people chatter quietly"], "sample_ids": ["tPJvjq9QePY", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["animal, bleat, moo", "wind, chatter, people"], "captions_pred_video": ["a dog and a sheep in a barn", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a goat bleats as a person speaks", "wind blows as people chatter quietly"], "sample_ids": ["tPJvjq9QePY", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["bleats, person, speak", "wind, chatter, people"], "captions_pred_video": ["a dog and a sheep in a barn", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is 
speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uqFtmnhuqA8", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["a, b, c", "a woman, something, fried"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "- a woman cooking in the kitchen"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a horse runs while two women talk", "a man speaks as horns blow"], "sample_ids": ["sdvI1mHAsc", "tHyNqRyK34A"], "start_seconds": ["20", "24"], "properties": ["two women, horse, run", "a, man, speaks"], "captions_pred_video": [null, "being taken from inside a vehicle on the street at night"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking and a car is honking with background noise "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "paper is crumpling consistently"], "sample_ids": ["x9JovgqUcs", "v5cSxLaHADY"], "start_seconds": ["500", "0"], "properties": ["a, man, speaks, keyboard", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man speaks and types on a keyboard", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["some men converse over an engine running", "wind blows as people chatter quietly"], "sample_ids": ["sCiy7QS1U", "xBxDz0CFVn0"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "wind, chatter, people"], "captions_pred_video": 
[null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wIJK3-5y0kA", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["a, cry, baby", "loud, jet engine, roar"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a baby cries and a woman speaks", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["u7C-AEBQM", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["ticks, rhythmic, quiet", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vbZ-0lGPneg", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["a woman, a television program, a bird", "People, motor, brakes"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "an airplane engine runs"], "sample_ids": 
["vZAw4apG0Es", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["background, tick, repeat", "engine, airplane, runs"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a clock is ticking and people are talking", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "water rushes by"], "sample_ids": ["u--KhUW8l1Y", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["horn, siren, life", "water, rushes, by"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes and water drains", "dishes cling together then a man begins to speak"], "sample_ids": ["sfAvvZwdLCY", "sQGXqGcwOTc"], "start_seconds": ["20", "3"], "properties": ["water drains, flushes, water", "cling, speak, dishes"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a toilet is flushed", "mechanisms are operating and water is splashing "], "question": "which entity is about water", "label": 0}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["voJh2gJxXhA", "tiDFTC-5vU"], "start_seconds": ["50", "30"], "properties": ["music, frog, croak", "male, duck, laugh"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", null], "captions_pred_audio": ["music is playing and crickets are chirping ", "a man is speaking and ducks are quacking"], "question": "which entity is 
a person speaking to a duck?", "label": 1}, {"captions": ["birds coo incessantly", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yZrFNS7GFBQ", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["coo, bird, incessant", "two men, woman, birds"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a video of birds?", "label": 0}, {"captions": ["a man speaks as a machine runs", "a machine beeps continuously"], "sample_ids": ["vD6lYD1l0BY", "y682ml90jGw"], "start_seconds": ["330", "11"], "properties": ["a, machine, run", "beeps, machine, continuously"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a beeping sound is being made "], "question": "which machine is beeping continuously?", "label": 0}, {"captions": ["a person is snoring while sleeping", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vJrjSeP17yE", "y8WEcpOlT3I"], "start_seconds": ["40", "40"], "properties": ["a person is sleeping, snoring, person", "harsh, wind, blows"], "captions_pred_video": ["a black background with a small plane flying in the sky", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking with wind noise in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["several insects fly while two men talk", "two men speak as a buffeting wind blows"], "sample_ids": ["s-T9OVOiMLo", "y8WEcpOlT3I"], "start_seconds": ["330", "40"], "properties": ["several, fly, men", "wind, speak, buffeting"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a 
man is speaking while insects are buzzing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is about a buffeting wind?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "water flows and trickles"], "sample_ids": ["t97k0cejSQE", "tB7hWb9gTuQ"], "start_seconds": ["250", "30"], "properties": ["bird, chirp, insect", "water, flow, trickle"], "captions_pred_video": ["a bee on a purple thistle flower", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a mechanical buzzing getting louder"], "sample_ids": ["u--KhUW8l1Y", "sEprKHm8Sj8"], "start_seconds": ["0", "90"], "properties": ["horn, siren, life", "noise, loud, buzzing"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vZAw4apG0Es", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["background, clock, ticktocks", "wind, blow, vehicle"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a 
clock ticking in the background?", "label": 0}, {"captions": ["a heavy rain falls endlessly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wP8ZKrlx3oA", "su6FAOcOA8c"], "start_seconds": ["40", "4"], "properties": ["heavy, rain, fall", "engine, idle, woman"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "small dogs growl, bark and yip."], "sample_ids": ["yRx9txMcBl0", "sShpyu2l4YQ"], "start_seconds": ["40", "0"], "properties": ["accelerates, tires, squeals", "growl, bark, yip"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "the puppies are playing with a toy"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a dog is barking and growling"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["an airplane engine runs", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["yVPZ2MNWpms", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["engine, airplane, runs", "music, gunfire, explosion"], "captions_pred_video": ["footage of an airport with planes 
parked on the tarmac", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car is driving by on the road ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a movie?", "label": 1}, {"captions": ["a power tool runs and touches a surface", "a child speaks in closed space"], "sample_ids": ["zfvPRf3chY", "yW6FWLSLkx4"], "start_seconds": ["290", "40"], "properties": ["power tool, run, touch", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not a power tool", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["vs65y4qmyBE", "vzxHnu-SFEw"], "start_seconds": ["340", "80"], "properties": ["engine, run, man", "two objects, woman, speak"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["popping and crackling repeats as men yell and laugh", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["rqu8iB22IY", "w34HjHr6gAY"], "start_seconds": ["5", "30"], "properties": ["sound, repeats, laugh", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "paper is crumpling consistently"], "sample_ids": ["uYT5gxnyMWM", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["a, scream, girl", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "paper is crumpled and crinkled"], "question": "which entity is a video of a girl speaking followed by a scream and more girls talking?", "label": 0}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "water splashes as an animal 
walks through"], "sample_ids": ["zY3icUyMdh8", "w1ir-sZ3Im8"], "start_seconds": ["20", "90"], "properties": ["dog, bark, engine", "animal, water, splashes"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "water splashes and gurgles as people speak"], "question": "which entity is a video of an animal walking through water?", "label": 1}, {"captions": ["a person is whistling a tune", "an insect buzzes around continuously"], "sample_ids": ["scYRUkrFLiQ", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["a, tune, whistle", "buzzes, continuously, insect"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person whistling a song", "a fly is buzzing around a microphone "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a baby cries and a woman moans", "an infant crying frantically"], "sample_ids": ["smDKStoHBJo", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["a, cry, woman", "cry, infant, frantically"], "captions_pred_video": ["a man holding a crying baby in his arms", "of the baby crying in the car seat"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a baby cries loudly"], "question": "which entity is crying frantically", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a child and woman laughs and the woman speaks"], "sample_ids": ["sU53zg9Jp7s", "uPDn2BFTHk"], "start_seconds": ["380", "140"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "woman, laughs, speaks"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": 
["birds chirp and a doorbell rings with breathing and music in the background ", "a baby laughs and a woman speaks"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a man speaks as a machine runs"], "sample_ids": ["sWZzXuWYY", "vD6lYD1l0BY"], "start_seconds": ["420", "330"], "properties": ["male, clanks, thumps", "a, machine, run"], "captions_pred_video": [null, "game controller being held in the hands of the person"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking and dishes are being washed "], "question": "which machine is running", "label": 1}, {"captions": ["a woman and man speak while food is frying", "people applaud and hoot and chat quietly"], "sample_ids": ["zk-xJGQU8-4", "wwyfGO2J4"], "start_seconds": ["130", "90"], "properties": ["food, man, woman", "people, applaud, hoot"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", null], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wvKpEYswXO0", "vJ7JPEFhyLA"], "start_seconds": ["150", "16"], "properties": ["water, tap, run", "three men, wind, flow"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 0}, {"captions": ["an engine runs and wind blows", "water pouring and bubbling"], "sample_ids": ["vs65y4qmyBE", "uyRfq-jKPpo"], 
"start_seconds": ["340", "50"], "properties": ["engine, run, wind", "water, bubbles, pouring"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["ugHJF0hfYkg", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["engine, running, continuously", "engine, revs, vehicle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which entity has a running engine", "label": 0}, {"captions": ["a toilet door squeaks as it is opened", "water splashes and a door squeaks"], "sample_ids": ["sdXV-ylviw", "sdXV-ylviw"], "start_seconds": ["190", "190"], "properties": ["door, toilet, squeaks", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dog barks and taps with background noise ", "a dog barks and taps 
with background noise "], "question": "which door is squeaking", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["v-wcQf4BDY0", "uYT5gxnyMWM"], "start_seconds": ["120", "50"], "properties": ["bark, yip, sharply", "a, scream, girl"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog barks and growls", "a woman is speaking and a baby is crying"], "question": "which entity is a human", "label": 1}, {"captions": ["a person is snoring while sleeping", "someone is snoring while sleeping"], "sample_ids": ["vJrjSeP17yE", "ujMt0-D-x2k"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "snore, sleep, someone"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the dog playing with a toy on the floor"], "captions_pred_audio": ["a person snoring loudly", "a person is snoring loudly"], "question": "which entity is snoring while sleeping", "label": 0}, {"captions": ["an audience gives applause as a man yells and a group sings", "waves crash against a shoreline and people speak"], "sample_ids": ["tdWhHV3X25Q", "yFB25fqfU8I"], "start_seconds": ["60", "300"], "properties": ["applause, audience, yells", "wave, crash, shoreline"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["un9VQlzgZM", "zFjIWfSD-4"], "start_seconds": ["5", "410"], "properties": ["females, talk, 
laugh", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a toilet flushes and water drains"], "sample_ids": ["v0x1odnXtP0", "sfAvvZwdLCY"], "start_seconds": ["210", "20"], "properties": ["keyboard, type, computer", "water drains, flushes, water"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a person is typing on a keyboard", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["water flows followed by women screaming", "a man speaks as a motor runs in the background"], "sample_ids": ["w5W5Kqtc8E", "xZepNM9qcRA"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["people clap and speak in the distance", "an insect buzzes around continuously"], "sample_ids": ["wwyfGO2J4", "v25l1jef3JY"], "start_seconds": ["90", "0"], "properties": ["clap, distance, speak", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "water splashes as 
an animal walks through"], "sample_ids": ["uKCSGgof8gI", "w1ir-sZ3Im8"], "start_seconds": ["12", "90"], "properties": ["chirps, distance, signal", "animal, water, splashes"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a person is whistling a tune", "people cheer as a vehicle engine revs"], "sample_ids": ["scYRUkrFLiQ", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["a, tune, whistle", "engine revs, vehicle, people"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person whistling a song", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["children cry and people talk", "a man speaks as a car is passing by"], "sample_ids": ["xLwHe825Zs", "sK4u5T8hW78"], "start_seconds": ["18", "30"], "properties": ["people talk, children cry, people talk", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a train horn blares as a train passes, 
then fades", "a car accelerates and wind blows"], "sample_ids": ["zVacuqSb4LI", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["blares, fades, train", "accelerates, wind, blows"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", null], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["someone is snoring while sleeping", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ujMt0-D-x2k", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["snore, sleep, someone", "female, spraying, scream"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["x5cuQjOdM3E", "wyllXV6PjKo"], "start_seconds": ["30", "30"], "properties": ["cat, meows, young woman", "a baby, a woman, a man"], "captions_pred_video": ["a 
black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman speaks and a baby cries"], "question": "which entity is a child", "label": 1}, {"captions": ["wind blows strongly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["w8uLijTqtlU", "yajyRTUQk3U"], "start_seconds": ["70", "400"], "properties": ["wind, blows, strongly", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and shaky", "- a woman cooking in the kitchen"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["an aircraft engine runs", "water pouring and bubbling"], "sample_ids": ["yLCORCnd35Q", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["engine, aircraft, runs", "water, bubbles, pouring"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a baby cries and a woman speaks", "females talk and laugh 
over gusting wind"], "sample_ids": ["tMbMDvT50j8", "un9VQlzgZM"], "start_seconds": ["12", "5"], "properties": ["a, cry, woman", "females, talk, laugh"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is a group of people", "label": 1}, {"captions": ["birds coo incessantly", "a man speaks as a car is passing by"], "sample_ids": ["yZrFNS7GFBQ", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["coo, bird, incessant", "a, car, pass"], "captions_pred_video": ["of the bird in the cage", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a man speaks followed by another man speaking outside"], "sample_ids": ["wRBHTgrbiwg", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["bird, owl, speak", "two men, speak, follow"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before 
women yell", "wind blowing followed by a zoom"], "sample_ids": ["w5W5Kqtc8E", "vr8ZXjEBhMQ"], "start_seconds": ["100", "150"], "properties": ["wind, blow, vehicle", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "wind blows and a chainsaw cuts through wood "], "question": "which entity has a zoom?", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["ul60S8TXDA8", "su6FAOcOA8c"], "start_seconds": ["60", "4"], "properties": ["sound, distance, bell", "engine, idle, woman"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "waves crash against a shoreline and people speak"], "sample_ids": ["tDlysoZiA1I", "yFB25fqfU8I"], "start_seconds": ["0", "300"], "properties": ["animal, grunt, multiple", "wave, crash, shoreline"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a person surfing in the ocean"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "an airplane engine spools and people speak"], "sample_ids": ["uYT5gxnyMWM", "wTjoRj1se3U"], "start_seconds": ["50", "390"], "properties": ["person, spray, yell", "airplane, engine, spool"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", 
"footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking over spraying and another person yelling?", "label": 0}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["uqFtmnhuqA8", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["a, b, c", "loud, laughter, intermittent"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a scream", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "a duck quacks continuously"], "sample_ids": ["w8uLijTqtlU", "vh30P49Po6s"], "start_seconds": ["70", "30"], "properties": ["wind, microphone, noise", "quacks, continuously, duck"], "captions_pred_video": ["footage is blurry and shaky", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["the wind is blowing strongly", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["x5cuQjOdM3E", "vVhthZ45k3Y"], "start_seconds": ["30", "30"], "properties": ["cat, meows, young woman", "cat, purr, hiss"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage is blurry and out of focus"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a cat is meowing"], "question": "which entity is more playful", "label": 1}, {"captions": ["wind blows and 
people scream while an engine revs", "an engine runs loudly"], "sample_ids": ["w5W5Kqtc8E", "vqZuVbG6-HI"], "start_seconds": ["100", "130"], "properties": ["wind, engine, scream", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["w5W5Kqtc8E", "ziUT9IFTkjg"], "start_seconds": ["100", "10"], "properties": ["water, splashes, motorboat", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone snores nearby", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["spJCm8tD9Zo", "siJFXfGWgDk"], "start_seconds": ["90", "50"], "properties": ["someone snores, nearby, someone", "a, bird, vehicle"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and birds are chirping in the background "], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "an insect buzzes around continuously"], "sample_ids": ["vf9xf3vMsGM", "v25l1jef3JY"], "start_seconds": ["540", "0"], "properties": ["A man speaks while turning a water faucet on.", "buzzes, continuously, insect"], "captions_pred_video": ["of the person washing their hands under the faucet", "a black background with a cartoon character in the foreground"], 
"captions_pred_audio": ["a man is speaking while water is running in the background", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["food is frying while a woman speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["yhQ2Lg-7qDY", "xjhAnI2q6hM"], "start_seconds": ["130", "6"], "properties": ["food, woman, speak", "engine revs, vehicle, people"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a truck is revving its engine and a man is speaking "], "question": "what is the woman doing while the food is frying?", "label": 0}, {"captions": ["water is sprayed across a hard surface", "a man is snoring loudly and repeatedly"], "sample_ids": ["sQwlkXjQabo", "sncRqQ67iJU"], "start_seconds": ["10", "460"], "properties": ["water, spray, surface", "loud, repeatedly, man"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["spraying followed by silence", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "wind blowing followed by a zoom"], "sample_ids": ["yajyRTUQk3U", "vr8ZXjEBhMQ"], "start_seconds": ["400", "150"], "properties": ["noise, woman, speak", "wind, blow, zoom"], "captions_pred_video": ["- a woman cooking in the kitchen", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "food is frying while a woman speaks"], "sample_ids": ["wz7N8YRy74I", 
"yhQ2Lg-7qDY"], "start_seconds": ["30", "130"], "properties": ["rooster, crow, background, people", "food, woman, speak"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a faucet is running and a man is speaking"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "a dog whimpers as someone inhales/exhales briefly"], "sample_ids": ["uiS58TNyUiw", "vmrxwuAMb2I"], "start_seconds": ["430", "40"], "properties": ["vocalize, bird, chirp", "a dog, inhales, exhales"], "captions_pred_video": ["of the pigeon in the cage", "of the dog laying on the bed with his head out of the blanket"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a dog barks and growls"], "question": "which animal is not vocalizing", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a propeller rotates loudly and intensely"], "sample_ids": ["y4tPJXBKDig", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["a, noise, talk", "loud, intense, propeller"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a helicopter is flying overhead "], "question": "which noise is louder", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wqUmIEzuNz4", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["frog, bird, vocalize", "music, gunfire, explosion"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a cat meows 
and rustles", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x6ijhqRY38s", "xKB8O8LTs6s"], "start_seconds": ["250", "70"], "properties": ["bowl, silverware, man", "music, gunfire, explosion"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zl9Dqx-j7q4", "yajyRTUQk3U"], "start_seconds": ["6", "400"], "properties": ["motors rev, laugh, loudly", "a woman, something, fried"], "captions_pred_video": ["footage of a man driving a car in the dark", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a jet engine roars ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a stream of water runs briefly"], "sample_ids": ["sSMl2vc3ek", "x-PeY8Yb8M4"], "start_seconds": ["20", "300"], "properties": ["a person, laughs, snores", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["some men converse over an engine running", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sCiy7QS1U", "w5W5Kqtc8E"], "start_seconds": 
["300", "100"], "properties": ["men, converse, engine", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vW4x7S1VfQc", "wz7N8YRy74I"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "rooster, crow, background, men"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "someone is typing on a computer keyboard"], "sample_ids": ["sjlVMgdGSK0", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["accelerates, vehicle, race car", "keyboard, type, computer"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "how to make money on youtube in spanish"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which is a stationary object", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "people speak as gunfire rings out"], "sample_ids": ["yeFvk9x0wWI", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["clack, bird, chirp", "gunfire, ring, speak"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["birds chirp in the background as a car 
drives by ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "an insect buzzes around continuously"], "sample_ids": ["x6ijhqRY38s", "v25l1jef3JY"], "start_seconds": ["250", "0"], "properties": ["something metal, glass, hit", "buzzes, continuously, insect"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a train horn sounds and railroad crossing ring", "children cheer as a man speaks then an audience screams"], "sample_ids": ["s7knHCFW82w", "vJvryTwuAV8"], "start_seconds": ["30", "16"], "properties": ["horn, sound, train", "audience, cheer, man"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity is a person speaking to an audience?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zl9Dqx-j7q4", "zj2R0XoFr5k"], "start_seconds": ["6", "50"], "properties": ["motors rev, laugh, loudly", "airplane, boy, fly"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a jet engine roars ", "a woman speaks while a helicopter flies overhead "], "question": 
"which entity is flying", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["zj2R0XoFr5k", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["airplane, fly, overhead", "wind, blows, vehicle"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a jet engine roars and wind blows "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "water is sprayed across a hard surface"], "sample_ids": ["y1saVTXsKwc", "sQwlkXjQabo"], "start_seconds": ["80", "10"], "properties": ["a, dog, talk", "water, spray, surface"], "captions_pred_video": ["a dog playing with a pink ball", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a dog barks and a man speaks", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vmrxwuAMb2I", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["a dog, inhales, exhales", "rooster, crow, background, men"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "an airplane engine roars increasingly louder"], "sample_ids": ["sjlVMgdGSK0", "vBslzh7saPw"], "start_seconds": ["30", "90"], "properties": ["accelerates, vehicle, race 
car", "engine, roar, louder"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine roars and accelerates "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a toilet flushes and water drains unevenly"], "sample_ids": ["yYEVLuqEytU", "vhJWZheqaE"], "start_seconds": ["40", "0"], "properties": ["grunt, slurp, background", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["a baby goat is being petted by a person's hand", null], "captions_pred_audio": ["several sheep bleat and a man speaks", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "people speaking indiscriminately in the distance with a person snoring loudly nearby"], "sample_ids": ["sAam2NqGhLY", "w2JXXIAdUdg"], "start_seconds": ["20", "10"], "properties": ["snoring, breathing, child", "snoring, distance, person"], "captions_pred_video": ["of a little girl sleeping on a couch", "a close up shot of a person's mouth with a toothbrush in it"], "captions_pred_audio": ["a person is snoring", "a person snoring and a dog whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a telephone rings followed by a woman talking"], "sample_ids": ["x9JovgqUcs", "tGcFnX0GHI"], "start_seconds": ["500", "0"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["someone is snoring while 
sleeping", "a woman speaks as she rubs two objects together"], "sample_ids": ["ujMt0-D-x2k", "vzxHnu-SFEw"], "start_seconds": ["0", "80"], "properties": ["snore, sleep, someone", "two objects, woman, speak"], "captions_pred_video": ["of the dog playing with a toy on the floor", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "people speak as gunfire rings out"], "sample_ids": ["uJV8NDaHqqk", "wqTCwqVRDlk"], "start_seconds": ["100", "80"], "properties": ["loud, fly, chirp", "gunfire, ring, speak"], "captions_pred_video": ["a bee hive in a wooden box", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a telephone rings followed by a woman talking"], "sample_ids": ["y8WEcpOlT3I", 
"tGcFnX0GHI"], "start_seconds": ["40", "0"], "properties": ["harsh, wind, blows", "ring, talk, woman"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a drill drills through something then people begin laughing"], "sample_ids": ["ylpYOorfH4o", "tEE3MpBt1sg"], "start_seconds": ["410", "50"], "properties": ["engine, run, loud", "drill, something, laugh"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a man is speaking and an engine is revving", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a infant makes noise and is excited", "water splashes as an animal walks through"], "sample_ids": ["wIJK3-5y0kA", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["noise, excited, infant", "animal, water, splashes"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a baby cries and a woman speaks", "water splashes and gurgles 
as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "winds blows roughly as a vehicle races past"], "sample_ids": ["xyL9F5VrjkE", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["wind, blows, vehicle", "wind, blows, vehicle"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a jet engine roars and wind blows "], "question": "which entity shows a vehicle running?", "label": 0}, {"captions": ["an infant crying as a woman laughs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["xhmRY9yhC7c", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["a, laugh, infant", "three men, wind, flow"], "captions_pred_video": ["of a baby crying in a baby bouncer", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman and an infant?", "label": 0}, {"captions": ["food is frying while a woman speaks", "an airplane engine runs"], "sample_ids": ["yhQ2Lg-7qDY", "yVPZ2MNWpms"], "start_seconds": ["130", "0"], "properties": ["food, woman, speak", "engine, airplane, runs"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "some tunes played by whistling"], "sample_ids": ["sZvwOuuPGP0", "u6BnG6YZqJ4"], "start_seconds": ["50", "0"], "properties": ["engine, diesel, truck", "tune, play, whistling"], "captions_pred_video": ["of a 
bulldozer clearing a road in a forest stock footage and royalty-free videos", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a medium engine is running ", "a person whistling a song"], "question": "which entity is not a vehicle", "label": 1}, {"captions": ["an aircraft engine runs", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yLCORCnd35Q", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["engine, aircraft, runs", "a woman, laughs, animal"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "waves crash against a shoreline and people speak"], "sample_ids": ["uqFtmnhuqA8", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["a, b, c", "wave, crash, shoreline"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage of a person surfing in the ocean"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more like a natural phenomenon", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xvDdE3zNf8Y", "wqZ135Ssz0"], "start_seconds": ["120", "60"], "properties": ["A, crumple, paper", "two men, woman, birds"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and ducks are quacking with wind noise in the background "], "question": 
"which entity has more people", "label": 1}, {"captions": ["footsteps followed by a flushing toilet", "a man speaks while water drains"], "sample_ids": ["yXrw3GRMZag", "vSeGhaZt-aI"], "start_seconds": ["40", "50"], "properties": ["sound, toilet, flush", "water, drain, man"], "captions_pred_video": ["footage of a toilet bowl with water in it", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["rustling followed by a toilet flushing", "a man is speaking and pouring liquid with background noise "], "question": "which entity has a man speaking while water drains?", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "an electric engine works nearby followed by a child talking"], "sample_ids": ["wTjoRj1se3U", "xSKJGCItUWE"], "start_seconds": ["390", "10"], "properties": ["engine, run, people", "engine, work, child"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a jet engine is running and people are talking", "a high pitched engine is running and a child speaks"], "question": "which entity has a child talking?", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vhJWZheqaE", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["water drains unevenly, toilet flushes, water drains", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a train horn blows as it passes by"], "sample_ids": ["xV7Mg1QucSc", "zVacuqSb4LI"], "start_seconds": ["14", "30"], "properties": ["alarm, ticktocks, laughs", 
"horn, blows, train"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "an insect buzzes around continuously"], "sample_ids": ["ugHJF0hfYkg", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "buzzes, continuously, insect"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a helicopter is flying overhead ", "a fly is buzzing around a microphone "], "question": "which entity is not a helicopter?", "label": 1}, {"captions": ["a child speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yW6FWLSLkx4", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["a, child, speaks", "a woman, laughs, animal"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": 
["a woman is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["wSVhSdj0F0", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["horn honks, keys jingle, electronic beep", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a propeller rotates loudly and intensely"], "sample_ids": ["zl9Dqx-j7q4", "ugHJF0hfYkg"], "start_seconds": ["6", "10"], "properties": ["motors rev, laugh, loudly", "loud, intense, propeller"], "captions_pred_video": ["footage of a man driving a car in the dark", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a jet engine roars ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "frogs croak and vocalize"], "sample_ids": ["xl2PIWyXaM", "yswmmRZFItk"], "start_seconds": ["160", "0"], "properties": ["chirp, man, younger person", "croak, vocalize, frog"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and people are talking", "a frog is croaking"], "question": "which animal is more vocal", "label": 1}, {"captions": ["some people speak", "continuous snoring"], "sample_ids": ["vbZ-0lGPneg", "sLkeqCDJIyw"], "start_seconds": ["30", "120"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "loud, snoring, noise"], "captions_pred_video": ["of a man 
holding a baby duck in his hands", ", what is the man doing on the couch? sleeping"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a person is snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a small engine idles continuously", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["y5WII6cTH7k", "xKB8O8LTs6s"], "start_seconds": ["40", "70"], "properties": ["engine, idle, continuously", "music, gunfire, explosion"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an engine is knocking and vibrating ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "a man speaks as a car is passing by"], "sample_ids": ["yDoT73BWsdA", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["engine, revs, vehicle", "a, car, pass"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which vehicle is passing by", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["wSVhSdj0F0", "w34HjHr6gAY"], "start_seconds": 
["10", "30"], "properties": ["horn honks, keys jingle, slam", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "frogs croak and vocalize"], "sample_ids": ["uWAAAL4CIoc", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["a woman, chirps, animal", "croak, vocalize, frog"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a frog is croaking"], "question": "which animal is speaking", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xjvTpk2Zpr8", "yajyRTUQk3U"], "start_seconds": ["70", "400"], "properties": ["wind, blows, vehicle", "a woman, something, fried"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": 
["yRx9txMcBl0", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["motors, tires, screech", "clickety-clack, train, whistle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a stream of water runs briefly"], "sample_ids": ["y2bVZ7rz-5M", "x-PeY8Yb8M4"], "start_seconds": ["280", "300"], "properties": ["engine, horn, siren", "stream, water, run"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a car is driving on a wet road "], "question": "which entity is a moving object", "label": 0}, {"captions": ["food is frying while a woman speaks", "an airplane engine spools and people speak"], "sample_ids": ["yhQ2Lg-7qDY", "wTjoRj1se3U"], "start_seconds": ["130", "390"], "properties": ["food, woman, speak", "airplane, engine, spool"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a faucet is running and a man 
is speaking", "a jet engine is running and people are talking"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["vehicles pass by on a roadway", "paper folding and crinkling"], "sample_ids": ["tgbONvsP47Y", "zPpG3RD8lSs"], "start_seconds": ["0", "20"], "properties": ["pass, vehicle, roadway", "paper, fold, crinkle"], "captions_pred_video": ["footage of a fire truck entering a garage", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a car is driving on the road ", "the wind blows and a mouse clicks "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a toilet flushes and water drains", "a woman speaks as she rubs two objects together"], "sample_ids": ["sfAvvZwdLCY", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["water drains, flushes, water", "two objects, woman, speak"], "captions_pred_video": ["footage of the toilet in the bathroom", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a person is burping while a girl speaks"], "sample_ids": ["wqZ135Ssz0", "vdoxuJn9lTc"], "start_seconds": ["60", "40"], "properties": ["man, woman, squawks", "person, burp, girl"], "captions_pred_video": [null, "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a child speaks followed by a burp"], "question": "which entity has a girl speaking?", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uoGVs9yUqY4", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["multiple, vocalize, wind", "a woman, something, fried"], "captions_pred_video": ["for how to make a wooden shed door youtube", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, 
{"captions": ["the wind blows while a vehicle engine runs", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["xyL9F5VrjkE", "vVhthZ45k3Y"], "start_seconds": ["20", "30"], "properties": ["wind, blows, vehicle", "cat, purr, hiss"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage is blurry and out of focus"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a cat is meowing"], "question": "which entity is more likely to be a cat", "label": 1}, {"captions": ["water flows followed by women screaming", "a man speaks as a car is passing by"], "sample_ids": ["w5W5Kqtc8E", "sK4u5T8hW78"], "start_seconds": ["100", "30"], "properties": ["water, flow, women", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "people applaud and hoot and chat quietly"], "sample_ids": ["u5RmF3c3Aw", "wwyfGO2J4"], "start_seconds": ["60", "90"], "properties": ["engine, car, zoom", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "some 
men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vh30P49Po6s", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["loud, continuous, quacks", "men, talk, cars"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["ugHJF0hfYkg", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["loud, propeller, move", "loud, jet engine, roar"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a helicopter is flying overhead ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "an insect buzzes around continuously"], "sample_ids": ["t8CV69hcvF0", "v25l1jef3JY"], "start_seconds": ["210", "0"], "properties": ["person, sneeze, follow", "buzzes, continuously, insect"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman sneezes and 
speaks", "a fly is buzzing around a microphone "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vf44CgrjT0A", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["loud, long, person", "applause, audience, yells"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a loud burp", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xSKJGCItUWE", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["engine, work, child", "multiple, people, yell"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["tDVADusiIoc", "xfaoyyzw2WU"], "start_seconds": ["60", "180"], "properties": ["water, radio, man", "loud, jet engine, roar"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vKrYfzleLB8", 
"xfaoyyzw2WU"], "start_seconds": ["110", "180"], "properties": ["a, ring, gunshots", "loud, jet engine, roar"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "vehicles pass by on a roadway"], "sample_ids": ["s6DESzUTGjY", "tgbONvsP47Y"], "start_seconds": ["16", "0"], "properties": ["wind, laugh, woman", "pass, vehicle, roadway"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a car is driving on the road "], "question": "which entity is more likely to be seen in a movie", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vzxHnu-SFEw", "zj2R0XoFr5k"], "start_seconds": ["80", "50"], "properties": ["two objects, woman, speak", 
"airplane, boy, fly"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["a child speaks", "an airplane engine runs"], "sample_ids": ["yW6FWLSLkx4", "yVPZ2MNWpms"], "start_seconds": ["40", "0"], "properties": ["a, child, speaks", "engine, airplane, runs"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "birds chirp and objects are moved around"], "sample_ids": ["sTpirNYo8vQ", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["a, tone, fast", "birds chirp, objects are 
moved around, birds"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "someone is typing on a computer keyboard"], "sample_ids": ["tOj4tdLRaA", "v0x1odnXtP0"], "start_seconds": ["70", "210"], "properties": ["woman, laugh, baby", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "water flows and trickles"], "sample_ids": ["uzQnlJXBbOM", "tB7hWb9gTuQ"], "start_seconds": ["50", "30"], "properties": ["ringing, beep, stop", "water, flow, trickle"], "captions_pred_video": ["footage of a person using a cell phone on a table", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a telephone rings and a man speaks", "water is splashing and gurgling"], "question": "which entity is a continuous flow", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["rqu8iB22IY", "t97k0cejSQE"], "start_seconds": ["5", "250"], "properties": ["sound, repeats, laugh", "sound, chirp, buzz"], "captions_pred_video": [null, "a bee on a purple thistle flower"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a bee buzzes and a woman speaks"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a machine runs continuously", "a person snores loudly multiple times at a close distance"], 
"sample_ids": ["wdXV3Pv0jiY", "sSMl2vc3ek"], "start_seconds": ["11", "20"], "properties": ["machine, running, continuously", "loud, multiple, distance"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a person snoring loudly"], "question": "which entity is not a machine?", "label": 1}, {"captions": ["an animal quacks rapidly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vh30P49Po6s", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["animal, quacks, rapidly", "stream, water, flow"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage is blurry and out of focus"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "people applaud and hoot and chat quietly"], "sample_ids": ["sLUnaPT5gM8", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["loud, laughter, intermittent", "people, applaud, hoot"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a car speeding up in the distance"], "sample_ids": ["w34HjHr6gAY", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["beeps, squawk, child speaking", "distance, car, speed"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "an infant crying frantically"], "sample_ids": ["vdoxuJn9lTc", "zwOBqeFTgiU"], "start_seconds": ["40", "30"], "properties": ["burp, loud, girl", "cry, infant, frantically"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of the baby crying in the car seat"], "captions_pred_audio": ["a child speaks followed by a burp", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a man speaks as a car is passing by", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["sK4u5T8hW78", "vXlk0lIQBFo"], "start_seconds": ["30", "470"], "properties": ["a, car, pass", "wind, speak, vocalize"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is about a man speaking as a car is passing by?", "label": 0}, {"captions": ["a man speaks then rubs two hard objects 
together", "paper folding and crinkling"], "sample_ids": ["yJ0TePmaOo", "zPpG3RD8lSs"], "start_seconds": ["390", "20"], "properties": ["two hard objects, man, speak", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "the wind blows and a mouse clicks "], "question": "which object is softer", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a man is snoring loudly and repeatedly"], "sample_ids": ["wIJK3-5y0kA", "sncRqQ67iJU"], "start_seconds": ["30", "460"], "properties": ["a, cry, baby", "loud, repeatedly, man"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a baby cries and a woman speaks", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "wind blows as people chatter quietly"], "sample_ids": ["zCrAfDfv6-A", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["person, mouse, click", "wind, chatter, people"], "captions_pred_video": ["shows a man with 
glasses and a green shirt sitting on a couch in a living room", "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistles a song", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["water splashes as an animal walks through", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["w1ir-sZ3Im8", "w5W5Kqtc8E"], "start_seconds": ["90", "100"], "properties": ["animal, water, splashes", "wind, blow, vehicle"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["continuous snoring", "a train horn blows as it passes by"], "sample_ids": ["sLkeqCDJIyw", "zVacuqSb4LI"], "start_seconds": ["120", "30"], "properties": ["loud, snoring, noise", "horn, blows, train"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a person is snoring loudly", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which noise is louder", "label": 0}, {"captions": ["a girl speaks followed by a scream and more girls talking", "someone whistles a tune"], "sample_ids": ["uYT5gxnyMWM", "sIXTftIuUgw"], "start_seconds": ["50", "90"], "properties": ["a, scream, girl", "someone, tune, whistle"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["ugHJF0hfYkg", "xyL9F5VrjkE"], "start_seconds": ["10", "20"], "properties": ["engine, running, continuously", "wind, motor, distance"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["a helicopter is flying overhead ", "the wind is blowing and a car is passing by "], "question": "which entity is running continuously", 
"label": 0}, {"captions": ["a girl speaks followed by a scream and more girls talking", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["uYT5gxnyMWM", "vXlk0lIQBFo"], "start_seconds": ["50", "470"], "properties": ["a, scream, girl", "wind, speak, vocalize"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["tapping occurs then a baby cries", "some men converse over an engine running"], "sample_ids": ["wIJK3-5y0kA", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["a, cry, baby", "men, converse, engine"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a baby crying?", "label": 0}, {"captions": ["a toilet flushes and water drains", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sfAvvZwdLCY", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "male, duck, laugh"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "water pouring and bubbling"], "sample_ids": ["t8CV69hcvF0", "uyRfq-jKPpo"], "start_seconds": ["210", "50"], "properties": ["person, sneeze, follow", "water, bubbles, pouring"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman sneezes and speaks", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["a motorcycle engine works nearby", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tOSWIURC-4", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["engine, work, nearby", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a duck quacks and a woman speaks"], "question": "which entity is a natural event", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vmrxwuAMb2I", "wqZ135Ssz0"], "start_seconds": ["40", "60"], "properties": ["a dog, inhales, exhales", "two men, woman, birds"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", null], "captions_pred_audio": ["a dog barks and growls", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "people applaud and hoot and 
chat quietly"], "sample_ids": ["w2JXXIAdUdg", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["snoring, distance, person", "people, applaud, hoot"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "small dogs yip and bark sharply"], "sample_ids": ["vBslzh7saPw", "v-wcQf4BDY0"], "start_seconds": ["90", "120"], "properties": ["power, scream, increase", "bark, yip, sharply"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wRBHTgrbiwg", "w5W5Kqtc8E"], "start_seconds": ["50", "100"], "properties": ["bird, owl, speak", "wind, blow, vehicle"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "an engine runs loudly"], "sample_ids": ["vZAw4apG0Es", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["background, tick, repeat", "loud, engine, run"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a clock is ticking and people are 
talking", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["ukxt9I7eMMg", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["continuous, woman, speaking", "music, gunfire, explosion"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more action", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a man speaks as a motor runs in the background"], "sample_ids": ["yPUYU6t3rwo", "xZepNM9qcRA"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "background, motor, run"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["insects buzz and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "some men converse over an engine running"], "sample_ids": ["zALy31PjDl0", "sCiy7QS1U"], "start_seconds": ["21", "300"], "properties": ["a man, a vehicle, a horn", "men, converse, engine"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", null], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has more people", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "some men 
converse over an engine running"], "sample_ids": ["zsLxS-uLJTw", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["horn, blast, train", "men, converse, engine"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", null], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a train", "label": 0}, {"captions": ["water splashes and a motorboat passes as people yell", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w5W5Kqtc8E", "xKB8O8LTs6s"], "start_seconds": ["100", "70"], "properties": ["water, splashes, motorboat", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "someone is typing on a computer keyboard"], "sample_ids": ["yDoT73BWsdA", "v0x1odnXtP0"], "start_seconds": ["10", "210"], "properties": ["engine, revs, vehicle", "keyboard, type, computer"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "how to make money on youtube in spanish"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person is typing on a keyboard"], "question": "which is not a vehicle", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "pigeons vocalize and birds chirp"], "sample_ids": ["vW4x7S1VfQc", "uiS58TNyUiw"], "start_seconds": ["150", "430"], "properties": ["clacking, oil, woman", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "of the pigeon in the cage"], 
"captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["continuous sneezing together with speech", "a car accelerates and wind blows"], "sample_ids": ["x4dZyf9Gbj0", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["continuous, sneeze, speech", "accelerates, wind, blows"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "birds chirp and objects are moved around"], "sample_ids": ["t25U-v4k4ts", "yPUYU6t3rwo"], "start_seconds": ["40", "370"], "properties": ["a, chirps, bird", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "insects buzz and a man speaks"], "question": "which entity has more birds", "label": 1}, {"captions": ["water flows followed by women screaming", "a child speaks in closed space"], "sample_ids": ["w5W5Kqtc8E", "yW6FWLSLkx4"], "start_seconds": ["100", "40"], "properties": ["water, flow, women", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["bees buzz and wind blows", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tMJne1a4AFI", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["bees buzz, wind blows, bees", "applause, audience, 
yells"], "captions_pred_video": ["a swarm of bees on the ground", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a person speaks briefly", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["zOZleIRqZm4", "zFjIWfSD-4"], "start_seconds": ["80", "410"], "properties": ["person, talk, brief", "People, motor, brakes"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person talking?", "label": 0}, {"captions": ["a small engine idles continuously", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["y5WII6cTH7k", "su6FAOcOA8c"], "start_seconds": ["40", "4"], "properties": ["engine, idle, continuously", "engine, idle, woman"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a dark barks and whimpers", "a horn honks followed by a loud continuous buzzing while men speak"], "sample_ids": ["sYj4hpDUZDQ", "wsHBIgzs9Fs"], "start_seconds": ["30", "50"], "properties": ["barks, whimpers, dark", "horn, continuous, buzzing"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "shows a motorcycle riding down a country road with a motorcycle in the foreground"], "captions_pred_audio": ["a dog barks and a cat meows", "a car accelerates and revs its engine while a man 
speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a toilet flushes and a female speaks"], "sample_ids": ["sapQIQUhFc", "yaln9y8I7ms"], "start_seconds": ["280", "230"], "properties": ["water, trickles, flow", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["w5W5Kqtc8E", "yDoT73BWsdA"], "start_seconds": ["100", "10"], "properties": ["wind, engine, scream", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a clock ticktocks briefly", "a race car approaches quickly and slows down squealing tires"], "sample_ids": ["u7C-AEBQM", "sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["ticktocks, clock, ticktocks briefly", "car, tires, slows"], "captions_pred_video": [null, "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a ticktock of a clock", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a stream of water runs briefly"], "sample_ids": ["tEE3MpBt1sg", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], 
"properties": ["drill, something, laugh", "stream, water, run"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "people applaud and hoot and chat quietly"], "sample_ids": ["viuTg1M-dqg", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["two men, speak, follow", "people, applaud, hoot"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "a train horn blares as a train passes, then fades"], "sample_ids": ["tOSWIURC-4", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["noise, engine, revs", "blares, fades, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a lawn 
mower is running ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a clock ticktocks briefly", "an infant crying as a woman laughs"], "sample_ids": ["u7C-AEBQM", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, ticktocks briefly", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a ticktock of a clock", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman and man speak while food is frying", "a small voice speaks, music plays followed by a double whoosh, and then a bell dings"], "sample_ids": ["zk-xJGQU8-4", "tQWGZLItBXk"], "start_seconds": ["130", "170"], "properties": ["food, man, woman", "voice, music, whoosh"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "worms revolution screenshots"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a voice speaking?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vh30P49Po6s", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["loud, continuous, quacks", "a woman, laughs, animal"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a baby is crying"], "question": "which entity is not continuous", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tK4VlLsNxak", "tiDFTC-5vU"], "start_seconds": ["120", "30"], 
"properties": ["An adult male speaks, dials, and speaks into a rotary phone", "male, duck, laugh"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", null], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking and ducks are quacking"], "question": "which entity is a spoof", "label": 1}, {"captions": ["water runs into a sink while men speak", "water splashes as an animal walks through"], "sample_ids": ["vzceMbklWc", "w1ir-sZ3Im8"], "start_seconds": ["180", "90"], "properties": ["water, sink, run", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["water is running and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wqN6IIHw3po", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "engine, accelerate, idle"], "captions_pred_video": ["in your own words what is happening in this screenshot? 
blood splattered all over the place", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and water is splashing", "an engine is idling"], "question": "which entity is a moving object", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a man speaks as a motor runs in the background"], "sample_ids": ["wvKpEYswXO0", "xZepNM9qcRA"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "background, motor, run"], "captions_pred_video": ["of the person preparing food in the kitchen", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "dishes cling together then a man begins to speak"], "sample_ids": ["x9JovgqUcs", "sQGXqGcwOTc"], "start_seconds": ["500", "3"], "properties": ["a, man, speaks, keyboard", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man speaks and types on a keyboard", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a woman and man speak while food is frying", "a duck quacks loudly and continuously"], "sample_ids": ["zk-xJGQU8-4", "vh30P49Po6s"], "start_seconds": ["130", "30"], "properties": ["food, man, woman", "loud, continuous, quacks"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "a duck is quacking loudly"], "question": "which entity is speaking", 
"label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "water splashes as an animal walks through"], "sample_ids": ["y2ZBGpgbhHM", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["dog, chirp, breathe", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds chirping and a dog panting", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yJ0TePmaOo", "w5W5Kqtc8E"], "start_seconds": ["390", "100"], "properties": ["two hard objects, man, speak", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running?", "label": 1}, {"captions": ["continuous sneezing together with speech", "winds blows roughly as a vehicle races past"], "sample_ids": ["x4dZyf9Gbj0", "xjvTpk2Zpr8"], "start_seconds": ["130", "70"], "properties": ["continuous, sneeze, speech", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman sneezes and speaks", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man speaks while playing a video game on a keyboard", "water flows and trickles"], "sample_ids": ["tw76HGONaKg", "tB7hWb9gTuQ"], "start_seconds": ["570", "30"], "properties": ["A, game, keyboard", "water, flow, trickle"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of 
time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water rushes by", "a man speaks followed by another man speaking outside"], "sample_ids": ["x-PeY8Yb8M4", "viuTg1M-dqg"], "start_seconds": ["300", "30"], "properties": ["water, rushes, by", "two men, speak, follow"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zl9Dqx-j7q4", "vb1fPSDI4c"], "start_seconds": ["6", "30"], "properties": ["engine, laugh, loud", "multiple, people, yell"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a frog croaks as 
other frogs croak in the background", "a train horn blows as it passes by"], "sample_ids": ["yswmmRZFItk", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["background, frog, croak", "horn, blows, train"], "captions_pred_video": ["a close up of a frog in the water", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a frog is croaking", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a telephone rings and a bird vocalizes", "birds chirp and objects are moved around"], "sample_ids": ["skd2PphS6oI", "yPUYU6t3rwo"], "start_seconds": ["190", "370"], "properties": ["ring, bird, vocalize", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["leaves rustle while man speaks", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zOZleIRqZm4", "sLUnaPT5gM8"], "start_seconds": ["80", "0"], "properties": 
["leaves, rustle, speak", "loud, laughter, intermittent"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a propeller rotates loudly and intensely"], "sample_ids": ["sxYkFKFIZD0", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["screech, man, door", "loud, intense, propeller"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["w2bYrCVLT60", "uZesmtKZGSw"], "start_seconds": ["120", "250"], "properties": ["ducks, speak, quack", "men, talk, cars"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a child speaks", "pigeons vocalize and birds chirp"], "sample_ids": ["yW6FWLSLkx4", "uiS58TNyUiw"], "start_seconds": ["40", "430"], "properties": ["a, child, speaks", "vocalize, bird, chirp"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["someone snores nearby", "a man speaks as horns blow"], "sample_ids": ["spJCm8tD9Zo", "tHyNqRyK34A"], "start_seconds": ["90", "24"], "properties": ["someone snores, nearby, someone", "a, man, speaks"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "being taken from inside a vehicle on the street at night"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a car is honking with background noise "], "question": "which entity is a person", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a soft wind underscores a woman laughing"], "sample_ids": ["sOa7g-44Dag", "s6DESzUTGjY"], "start_seconds": ["30", "16"], "properties": ["background, man, spray", "wind, laugh, woman"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set 
up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a motorboat is moving with wind noise in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "water flows and trickles"], "sample_ids": ["vZAw4apG0Es", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["background, clock, ticktocks", "water, flow, trickle"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a clock is ticking and people are talking", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a toilet flushes and water drains"], "sample_ids": ["uiS58TNyUiw", "sfAvvZwdLCY"], "start_seconds": ["430", "20"], "properties": ["audio, man, speaking", "water drains, flushes, water"], "captions_pred_video": ["of the pigeon in the cage", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "people 
speak in a closed space"], "sample_ids": ["tZGN5a7ybxo", "sTpirNYo8vQ"], "start_seconds": ["60", "30"], "properties": ["ring, train, horn", "people, space, speak"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a man is speaking while a car is revving and accelerating "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["zY3icUyMdh8", "vlS6YMeWAPo"], "start_seconds": ["20", "40"], "properties": ["dog, bark, engine", "sheep, baa, birds"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a stream of water runs briefly"], "sample_ids": ["wEBlkGWVWwE", "x-PeY8Yb8M4"], "start_seconds": ["260", "300"], "properties": ["a, babble, woman", "stream, water, run"], "captions_pred_video": ["shows a person writing on the whiteboard", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a drill runs and two people laugh"], "sample_ids": ["xV7Mg1QucSc", "tEE3MpBt1sg"], "start_seconds": ["14", "50"], "properties": ["alarm, ticktocks, laughs", "two people, laugh, drill"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage is blurry due to the smoke in the air"], 
"captions_pred_audio": ["an alarm clock ticks and a woman laughs", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a clock ticktocks in wind", "paper is crumpling consistently"], "sample_ids": ["yVumC9TGknc", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, wind", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a series of beeps and chirps", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "a duck quacks continuously"], "sample_ids": ["tDlysoZiA1I", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, multiple", "quacks, continuously, duck"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a duck is quacking loudly"], "question": "which animal is making a noise", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["voJh2gJxXhA", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["music, frog, croak", "airplane, boy, fly"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a man 
speaking with light rustling", "water is sprayed across a hard surface"], "sample_ids": ["zOZleIRqZm4", "sQwlkXjQabo"], "start_seconds": ["80", "10"], "properties": ["light, rustling, man", "water, spray, surface"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a car speeding up in the distance", "a woman speaks and a baby laughs"], "sample_ids": ["u0TrcHhkPQ", "tOj4tdLRaA"], "start_seconds": ["20", "70"], "properties": ["distance, car, speed", "woman, laugh, baby"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a baby laughs and a woman speaks"], "question": "which entity is not moving", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "a stream of water flows as people talk and wind blows"], "sample_ids": ["x4a9YGIw4ok", "xBxDz0CFVn0"], "start_seconds": ["120", "30"], "properties": ["water, gurgles, stops", "stream, water, flow"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a toilet flushes and water splashes", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xjvTpk2Zpr8", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["engine, run, wind", "female, spraying, scream"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking and a baby is 
crying"], "question": "which entity is a person", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zY3icUyMdh8", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["dog, bark, engine", "rustling, ducks, quack"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "someone whistles a tune"], "sample_ids": ["sNB8zxXneIM", "sIXTftIuUgw"], "start_seconds": ["20", "90"], "properties": ["several, quack, cocks", "someone, tune, whistle"], "captions_pred_video": ["a group of geese in a cage", null], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["an animal quacks rapidly", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vh30P49Po6s", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["animal, quacks, rapidly", "a, scream, girl"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "wind blowing followed by a zoom"], "sample_ids": ["w5W5Kqtc8E", "vr8ZXjEBhMQ"], "start_seconds": ["100", "150"], "properties": ["water, splashes, motorboat", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of 
view"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, {"captions": ["an engine starts and increases in power", "a propeller rotates loudly and intensely"], "sample_ids": ["zjTG0gaGCUI", "ugHJF0hfYkg"], "start_seconds": ["80", "10"], "properties": ["power, increase, engine", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["water splashes as an animal walks through", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["w1ir-sZ3Im8", "wyllXV6PjKo"], "start_seconds": ["90", "30"], "properties": ["animal, water, splashes", "a baby, a woman, a man"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a woman speaks and a baby cries"], "question": "which entity is a person", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "an engine runs loudly"], "sample_ids": ["y1saVTXsKwc", "vqZuVbG6-HI"], "start_seconds": ["80", "130"], "properties": ["a, dog, talk", "loud, engine, run"], "captions_pred_video": ["a dog playing with a pink ball", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a dog barks and a man speaks", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["sSMl2vc3ek", "w6RTHR6AeAg"], "start_seconds": ["20", "40"], "properties": ["loud, multiple, distance", "call, owl, screech"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["a person snoring loudly", "an owl hoots and mechanisms operate "], "question": "which entity is a bird", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "some tunes played by whistling"], "sample_ids": ["wTideSjRFS0", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["food, sizzle, woman", "tune, play, whistling"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "water flows and trickles"], "sample_ids": ["yRx9txMcBl0", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "water, flow, trickle"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car is revving its engine and skidding ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["birds chirp as a bell rings", "several insects fly while two men talk"], "sample_ids": ["ziUT9IFTkjg", "s-T9OVOiMLo"], "start_seconds": ["10", "330"], "properties": ["chirp, bell, ring", "several, fly, 
men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zuua6-5goWw", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "a woman, a television program, a bird"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program playing?", "label": 1}, {"captions": ["food is frying while a woman speaks", "water flows as men speak and yell"], "sample_ids": ["yhQ2Lg-7qDY", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["food, woman, speak", "water, flow, men"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a man is speaking while the wind is blowing and water is 
splashing"], "question": "which entity is a video of a woman speaking?", "label": 0}, {"captions": ["an airplane engine roars increasingly louder", "paper is crumpling consistently"], "sample_ids": ["vBslzh7saPw", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["engine, roar, louder", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a jet engine roars and accelerates ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "an airplane engine runs"], "sample_ids": ["vW4x7S1VfQc", "yVPZ2MNWpms"], "start_seconds": ["150", "0"], "properties": ["clacking, oil, woman", "engine, airplane, runs"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["food sizzles in a frying pan", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a propeller rotates loudly and intensely"], "sample_ids": ["yYEVLuqEytU", "ugHJF0hfYkg"], "start_seconds": ["40", "10"], "properties": ["animal, pig, background", "loud, intense, propeller"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yJ0TePmaOo", "sSMl2vc3ek"], "start_seconds": ["390", "20"], "properties": 
["two hard objects, man, speak", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "people speak as gunfire rings out"], "sample_ids": ["zj2R0XoFr5k", "wqTCwqVRDlk"], "start_seconds": ["50", "80"], "properties": ["airplane, boy, fly", "gunfire, ring, speak"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["a woman sneezes then speaks", "some men converse over an engine running"], "sample_ids": ["x4dZyf9Gbj0", "sCiy7QS1U"], "start_seconds": ["130", "300"], "properties": ["sneezes, speaks, woman", "men, converse, engine"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman?", "label": 0}, {"captions": ["a goat bleats and someone makes a calling noise", "a car accelerates and wind blows"], "sample_ids": ["vlS6YMeWAPo", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["noise, bleat, call", "accelerates, wind, blows"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a dog barks and whimpers", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sShpyu2l4YQ", 
"tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["barks, whimpers, dog", "water, radio, man"], "captions_pred_video": ["the puppies are playing with a toy", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "wind blowing followed by a zoom"], "sample_ids": ["zgUgkpk78xU", "vr8ZXjEBhMQ"], "start_seconds": ["70", "150"], "properties": ["horn, bells, ring", "wind, blow, zoom"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "wind blowing followed by a zoom"], "sample_ids": ["sTpirNYo8vQ", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["a, tone, fast", "wind, blow, zoom"], "captions_pred_video": ["of a man taking a selfie on a bus", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "winds blows roughly as a vehicle races past"], "sample_ids": ["s6DESzUTGjY", "xjvTpk2Zpr8"], "start_seconds": ["16", "70"], "properties": ["wind, laugh, woman", "wind, blows, vehicle"], "captions_pred_video": ["how to 
set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is more calm", "label": 0}, {"captions": ["an insect buzzes around continuously", "a cat meows as a young woman speaks"], "sample_ids": ["v25l1jef3JY", "x5cuQjOdM3E"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "cat, meows, young woman"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a cat meows and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["some clanking with distant murmuring", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uMTTDZ2mb4", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["clanking, murmuring, distant", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["people are talking and a car is driving 
by with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["people speak then an engine runs", "dishes cling together then a man begins to speak"], "sample_ids": ["uMTTDZ2mb4", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["engine, run, people", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["a person screams glaringly", "paper is crumpling consistently"], "sample_ids": ["xC8kbrKJmco", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["glaringly, screams, person", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a goat is bleating ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["birds coo incessantly", "a woman speaks and then a man speaks"], "sample_ids": ["yZrFNS7GFBQ", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["coo, bird, incessant", "a, man, speaks"], "captions_pred_video": ["of the bird in the cage", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["an owl hoots in the background ", "a woman is speaking and a man is speaking"], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a male is speaking and a duck quacks as others laugh"], "sample_ids": 
["tiDFTC-5vU", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a man is speaking and ducks are quacking"], "question": "which entity is speaking and a duck quacks as others laugh", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a child speaks in closed space"], "sample_ids": ["x5cuQjOdM3E", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["cat, meows, young woman", "child, space, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a train engine runs and a horn blows", "water pouring and bubbling"], "sample_ids": ["zPX9o1uDiI", "uyRfq-jKPpo"], "start_seconds": ["40", "50"], "properties": ["engine, horn, run", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing 
", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a car speeds away loudly followed by a car revving loudly and driving away while outside"], "sample_ids": ["sncRqQ67iJU", "sjlVMgdGSK0"], "start_seconds": ["460", "30"], "properties": ["loud, repeatedly, man", "car, revving, loudly"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a person is snoring", "a car accelerates and revs its engine "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["xl2PIWyXaM", "xjvTpk2Zpr8"], "start_seconds": ["160", "70"], "properties": ["chirp, man, younger person", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["birds are chirping and people are talking", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["water is sprayed across a hard surface", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sQwlkXjQabo", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["water, spray, surface", "applause, audience, yells"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "water flows and trickles"], "sample_ids": ["y8WEcpOlT3I", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["wind, speak, buffeting", "water, flow, 
trickle"], "captions_pred_video": ["on how to use a sewing machine youtube", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "vehicles pass by on a roadway"], "sample_ids": ["u7C-AEBQM", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["ticks, rhythmic, quiet", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a ticktock of a clock", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["xZepNM9qcRA", "uWPRNLnpy7Y"], "start_seconds": ["30", "10"], "properties": ["background, motor, run", "accelerate, laugh, vehicle"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "is taken from a car driving down the street"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a man speaks as a motor runs in the background"], "sample_ids": ["w5W5Kqtc8E", "xZepNM9qcRA"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["motors rev and run loudly as a 
person laughs", "a man speaks as a car is passing by"], "sample_ids": ["zl9Dqx-j7q4", "sK4u5T8hW78"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "a, car, pass"], "captions_pred_video": ["footage of a man driving a car in the dark", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tiDFTC-5vU", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["male, duck, laugh", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["voJh2gJxXhA", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["music, frog, croak", "a woman, laughs, animal"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a woman is speaking and a baby is 
crying"], "question": "which entity has a frog in it?", "label": 0}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zALy31PjDl0", "tiDFTC-5vU"], "start_seconds": ["21", "30"], "properties": ["a man, a vehicle, a horn", "male, duck, laugh"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", null], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "water flows and trickles"], "sample_ids": ["vfYTJq7nU", "tB7hWb9gTuQ"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a duck quacks and a woman speaks", "water is splashing and gurgling"], "question": "which entity is a video of water flowing?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tOSWIURC-4", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["engine, work, nearby", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a lawn mower is running ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a duck quacks continuously", "a train horn blows as it passes by"], "sample_ids": ["vh30P49Po6s", "zVacuqSb4LI"], "start_seconds": ["30", "30"], "properties": ["quacks, continuously, duck", "horn, blows, train"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a duck is quacking loudly", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "men speak and a nozzle sprays liquid"], "sample_ids": ["s59PfAghdkM", "wRV8yMk886E"], "start_seconds": ["0", "0"], "properties": ["bird, chirp, background, horse, neigh", "liquid, spray, nozzle"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man speaks followed by a loud burst"], "question": "which entity is more likely to be used in a science class", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xMXvkIcaG0Y", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["sound, humming, rattling", "airplane, boy, fly"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine 
underneath the hood", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["an engine is revving and accelerating ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a small engine spits as it runs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sZvwOuuPGP0", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["spits, engine, runs", "engine, laugh, loud"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a medium engine is running ", "a jet engine roars "], "question": "which entity is followed by a man laughing", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "paper is crumpling consistently"], "sample_ids": ["uYT5gxnyMWM", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["female, spraying, scream", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "paper is crumpled and crinkled"], "question": "which entity is a video of a person speaking", "label": 0}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "people cheer as a vehicle engine revs"], "sample_ids": ["sU53zg9Jp7s", "xjhAnI2q6hM"], "start_seconds": ["380", "6"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "engine revs, vehicle, people"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and 
music in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "an insect buzzes around continuously"], "sample_ids": ["wqUmIEzuNz4", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["frog, bird, vocalize", "buzzes, continuously, insect"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a cat meows and rustles", "a fly is buzzing around a microphone "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["vZAw4apG0Es", "xV7Mg1QucSc"], "start_seconds": ["30", "14"], "properties": ["background, tick, repeat", "alarm, ticktocks, laughs"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking and people are talking", "an alarm clock ticks and a woman laughs"], "question": "which entity has a tick that repeats", "label": 0}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a woman speaks happily and an animal chirps"], "sample_ids": ["wvKpEYswXO0", "uWAAAL4CIoc"], "start_seconds": ["150", "0"], "properties": ["sound, water, running", "a woman, chirps, animal"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking and a dog is barking "], "question": "which entity has a woman speaking softly?", "label": 0}, {"captions": ["speaking following by laughing and clapping", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["u2f5NpsoHBg", "zl9Dqx-j7q4"], 
"start_seconds": ["30", "6"], "properties": ["person, laugh, clap", "engine, laugh, loud"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a car accelerates and wind blows"], "sample_ids": ["v7jJS8aAyA", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["wind, blows, loudly", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a race car accelerates and revs its engine "], "question": "which vehicle is moving faster", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a stream of water flows quickly"], "sample_ids": ["wyllXV6PjKo", "wbHTKEJZyhc"], "start_seconds": ["30", "20"], "properties": ["a kid, talk, cry", "stream, water, flow"], "captions_pred_video": [null, "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the 
background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a woman speaks and a baby cries", "a waterfall is flowing and people are speaking "], "question": "which entity is moving", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vhJWZheqaE", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["water drains unevenly, toilet flushes, water drains", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a toilet is flushed", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["smGI3C1NZc", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["water, drain, toilet", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman talking?", "label": 1}, {"captions": ["a beep occurs briefly", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["xtWeJ56-U-g", "wqZ135Ssz0"], "start_seconds": ["20", "60"], "properties": ["beep, occur, briefly", "man, woman, squawks"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 
79 80 81 82 8", null], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is accompanied by a man and woman speaking", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "some men converse over an engine running"], "sample_ids": ["wyllXV6PjKo", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["a kid, talk, cry", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a kid?", "label": 0}, {"captions": ["people converse in the distance as a clock ticks", "an engine works in idle nearby followed by a man talking"], "sample_ids": ["vZAw4apG0Es", "wqADXCzngMw"], "start_seconds": ["30", "340"], "properties": ["people, clock, converse", "engine, idle, man"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "of a man working on a vintage volkswagen beetle"], "captions_pred_audio": ["a clock is ticking and people are talking", "a lawn mower is running and a man is speaking "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["race cars go around a track as a man commentates", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["uZesmtKZGSw", "yajyRTUQk3U"], "start_seconds": ["250", "400"], "properties": ["car, track, man", "a woman, something, fried"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an adult woman and an adult man speak", "someone is typing on a computer keyboard"], "sample_ids": ["zTLVJCo4WEE", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["two people, adult, speak", "keyboard, type, computer"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["w8uLijTqtlU", "yks4cLgIDMc"], "start_seconds": ["70", "170"], "properties": ["wind, microphone, noise", "background, speaking, child"], "captions_pred_video": ["footage is blurry and shaky", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "an engine runs loudly"], "sample_ids": ["sa6TLVbooCc", "vqZuVbG6-HI"], "start_seconds": ["240", "130"], "properties": ["people, laugh, child", "loud, engine, run"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a lawn mower is running and men are speaking "], "question": "which entity is 
louder", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yajyRTUQk3U", "vb1fPSDI4c"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "multiple, people, yell"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["wind blowing followed by a zoom", "water is sprayed across a hard surface"], "sample_ids": ["vr8ZXjEBhMQ", "sQwlkXjQabo"], "start_seconds": ["150", "10"], "properties": ["wind, blow, zoom", "water, spray, surface"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a duck quacks loudly and continuously"], "sample_ids": ["wqUmIEzuNz4", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["frog, bird, vocalize", "loud, continuous, quacks"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a cat meows and rustles", "a duck is quacking loudly"], "question": "which animal is making a noise", "label": 1}, {"captions": ["a man speaks uses a drill", "roadway noise occurs and a truck accelerates"], "sample_ids": ["x5eIC7S0fbg", "tgbONvsP47Y"], "start_seconds": ["60", "0"], "properties": ["A man is speaking, uses a drill, and is a tool", "noise, truck, accelerate"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "footage of a fire truck entering a garage"], 
"captions_pred_audio": ["a man is speaking and using a power tool ", "a car is driving on the road "], "question": "which is a tool", "label": 0}, {"captions": ["a machine runs continuously", "a stream of water runs briefly"], "sample_ids": ["wdXV3Pv0jiY", "x-PeY8Yb8M4"], "start_seconds": ["11", "300"], "properties": ["machine, running, continuously", "stream, water, run"], "captions_pred_video": ["footage is blurry and shaky", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a car is driving on a wet road "], "question": "which entity is running continuously", "label": 0}, {"captions": ["people clap and speak in the distance", "winds blows roughly as a vehicle races past"], "sample_ids": ["wwyfGO2J4", "xjvTpk2Zpr8"], "start_seconds": ["90", "70"], "properties": ["clap, distance, speak", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a person is snoring while sleeping", "an engine runs loudly"], "sample_ids": ["vJrjSeP17yE", "vqZuVbG6-HI"], "start_seconds": ["40", "130"], "properties": ["a person is sleeping, snoring, person", "loud, engine, run"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person snoring loudly", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "birds chirp and objects are moved around"], "sample_ids": ["vVhthZ45k3Y", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["cat, purr, hiss", "birds chirp, objects are moved around, birds"], "captions_pred_video": 
["footage is blurry and out of focus", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a person is snoring while sleeping", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vJrjSeP17yE", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["a person is sleeping, snoring, person", "sheep, baa, birds"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a person snoring loudly", "a goat bleats and birds chirp"], "question": "which entity is a person", "label": 0}, {"captions": ["an animal growls followed by birds chirping", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["y2ZBGpgbhHM", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["animal, growl, bird", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["t25U-v4k4ts", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["a, chirps, bird", "animal, grunts, snorts"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman is speaking and a baby is crying"], "question": "which entity has a bird chirp?", "label": 0}, {"captions": ["a person speaks briefly", "wind blowing followed by a zoom"], "sample_ids": ["zOZleIRqZm4", 
"vr8ZXjEBhMQ"], "start_seconds": ["80", "150"], "properties": ["person, talk, brief", "wind, blow, zoom"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not talking", "label": 1}, {"captions": ["some men converse over an engine running", "a man speaks as a car is passing by"], "sample_ids": ["sCiy7QS1U", "sK4u5T8hW78"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "some tunes played by whistling"], "sample_ids": ["w2bYrCVLT60", "u6BnG6YZqJ4"], "start_seconds": ["120", "0"], "properties": ["ducks, speak, quack", "tune, play, whistling"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["a small engine spits as it runs", "a propeller rotates loudly and intensely"], "sample_ids": ["sZvwOuuPGP0", 
"ugHJF0hfYkg"], "start_seconds": ["50", "10"], "properties": ["spits, engine, runs", "loud, intense, propeller"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a medium engine is running ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a dog barks and whimpers", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sShpyu2l4YQ", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["barks, whimpers, dog", "music, gunfire, explosion"], "captions_pred_video": ["the puppies are playing with a toy", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog is barking and growling", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vs65y4qmyBE", "su6FAOcOA8c"], "start_seconds": ["340", "4"], "properties": ["wind, blows, strongly", "engine, idle, woman"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "birds chirp in the background while a horse neighs followed by a girl speaking"], "sample_ids": ["zcDwZ6W7E3E", "s59PfAghdkM"], "start_seconds": ["180", "0"], "properties": ["a, man, speak", "bird, chirp, background, horse, neigh"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "is an anime 
scene featuring two people and a horse in the foreground and a fence in the background"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "birds are chirping a horse is neighing and a woman is speaking "], "question": "which entity has a horse neighing?", "label": 1}, {"captions": ["a baby laugh at a sputter", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sLUnaPT5gM8", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["laugh, sputter, baby", "People, motor, brakes"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", null], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["someone is snoring while sleeping", "water flows and trickles"], "sample_ids": ["ujMt0-D-x2k", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["snore, sleep, someone", "water, flow, trickle"], "captions_pred_video": ["of the dog playing with a toy on the floor", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a person is snoring loudly", "water is splashing and gurgling"], "question": "which entity is not a person", "label": 1}, {"captions": ["an animal quacks rapidly", "people cheer as a vehicle engine revs"], "sample_ids": ["vh30P49Po6s", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["animal, quacks, rapidly", "engine revs, vehicle, people"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a duck is quacking loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks then rubs two hard 
objects together", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["yJ0TePmaOo", "y8WEcpOlT3I"], "start_seconds": ["390", "40"], "properties": ["two hard objects, man, speak", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["birds chirps while a siren signals in the distance", "a stream runs then someone speaks"], "sample_ids": ["uKCSGgof8gI", "wbHTKEJZyhc"], "start_seconds": ["12", "20"], "properties": ["chirps, distance, signal", "stream, run, someone"], "captions_pred_video": ["footage of a street in a small town on a sunny day", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a truck 
is accelerating and revving its engine ", "a waterfall is flowing and people are speaking "], "question": "which entity is a stream?", "label": 1}, {"captions": ["someone snores nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["spJCm8tD9Zo", "wwyfGO2J4"], "start_seconds": ["90", "90"], "properties": ["someone snores, nearby, someone", "people, applaud, hoot"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks as several small engines run", "an infant crying frantically"], "sample_ids": ["u9A6VZQCZpU", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a man speaks as a car is passing by"], "sample_ids": ["vzxHnu-SFEw", "sK4u5T8hW78"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "a, car, pass"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with 
scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which object is moving", "label": 0}, {"captions": ["people speak softly as food sizzles", "paper folding and crinkling"], "sample_ids": ["yhQ2Lg-7qDY", "zPpG3RD8lSs"], "start_seconds": ["130", "20"], "properties": ["food, sizzle, speak", "paper, fold, crinkle"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's"], "captions_pred_audio": ["a faucet is running and a man is speaking", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "people speak as gunfire rings out"], "sample_ids": ["y8WEcpOlT3I", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["harsh, wind, blows", "gunfire, ring, speak"], "captions_pred_video": ["on how to use a sewing machine youtube", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["bees buzz and wind blows", "dishes cling together then a man begins to speak"], "sample_ids": ["tMJne1a4AFI", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["bees buzz, wind blows, bees", "cling, speak, dishes"], "captions_pred_video": ["a swarm of bees on the ground", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a swarm of bees buzzing around", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a frog croaks as other frogs croak in the background"], "sample_ids": ["uEU-Hg5MTN8", "yswmmRZFItk"], "start_seconds": ["27", "0"], "properties": ["a woman, laughs, animal", "background, frog, croak"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a close up of a frog in the water"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "a man speaks as a car is passing by"], "sample_ids": ["sdXV-ylviw", "sK4u5T8hW78"], "start_seconds": ["190", "30"], 
"properties": ["door, toilet, squeaks", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["people speak as gunfire rings out", "a woman speaks happily and an animal chirps"], "sample_ids": ["wqTCwqVRDlk", "uWAAAL4CIoc"], "start_seconds": ["80", "0"], "properties": ["gunfire, ring, speak", "a woman, chirps, animal"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", null], "captions_pred_audio": ["a man is speaking and a gun is fired", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["a helicopter engine idles continuously", "females talk and laugh over gusting wind"], "sample_ids": ["ugHJF0hfYkg", "un9VQlzgZM"], "start_seconds": ["10", "5"], "properties": ["engine, idle, continuously", "females, talk, laugh"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["a man speaks and is typing on a keyboard", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["x9JovgqUcs", "wDVMhEdTiVw"], "start_seconds": ["500", "30"], "properties": ["a, man, speaks, keyboard", "gun, shoot, water"], "captions_pred_video": 
[null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["a weapon fires multiple times", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sMC07Ucy7kg", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["weapon, fire, multiple", "rustling, ducks, quack"], "captions_pred_video": ["footage is from a car's point of view", null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a duck quacks and a woman speaks"], "question": "which entity is more likely to be used in a hunting situation", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "water pouring and bubbling"], "sample_ids": ["ukg5L09Wpvo", "uyRfq-jKPpo"], "start_seconds": ["150", "50"], "properties": ["clickety-clack, train, whistle", "water, bubbles, pouring"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], 
"captions_pred_audio": ["a train blows its whistle and blows its horn ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "water splashes and a motorboat passes as people yell"], "sample_ids": ["vveS8HT7Uog", "w5W5Kqtc8E"], "start_seconds": ["100", "100"], "properties": ["a man, objects, speak", "water, splashes, motorboat"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more active", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vBHyYJ8pL0", "vbZ-0lGPneg"], "start_seconds": ["2", "30"], "properties": ["noise, door, opening", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a video of a door opening and closing?", "label": 0}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wnpJndXuxLc", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["beeps, loud, whistle", "a woman, laughs, animal"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more quiet", 
"label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["viuTg1M-dqg", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["two men, speak, follow", "People, motor, brakes"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more people", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uoGVs9yUqY4", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["multiple, vocalize, wind", "rooster, crow, background, men"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a rooster?", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "plastic is tapped on while someone speaks"], "sample_ids": ["sTpirNYo8vQ", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["a, tone, fast", "plastic, tap, speak"], "captions_pred_video": ["of a man taking a selfie on a bus", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a woman speaks as frying food sizzles", "a toilet flushes and a female speaks"], "sample_ids": ["wTideSjRFS0", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": 
["food, sizzle, woman", "female, flushes, toilet"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a toilet flushes and a man speaks"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a man speaks as a motor runs in the background"], "sample_ids": ["uOpoD0gGXcs", "xZepNM9qcRA"], "start_seconds": ["120", "30"], "properties": ["chirps, woman, bird", "background, motor, run"], "captions_pred_video": ["a herd of cows grazing in the field", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a man speaking to a motor?", "label": 1}, {"captions": ["a dog barks and whimpers", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["sShpyu2l4YQ", "zY3icUyMdh8"], "start_seconds": ["0", "20"], "properties": ["barks, whimpers, dog", "dog, bark, engine"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a dog is barking and growling", "a car is driving and dogs are barking and squealing "], "question": "which dog barks and whimpers", "label": 0}, {"captions": ["a woman talking as an infant is crying", "a man speaks then multiple motorcycles pass by"], "sample_ids": ["tMbMDvT50j8", "zcDwZ6W7E3E"], "start_seconds": ["12", "180"], "properties": ["a, talk, infant", "a, man, speak"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["a baby cries and a woman speaks", 
"a man is speaking while a car accelerates and revs its engine "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a weapon fires multiple times"], "sample_ids": ["zcDwZ6W7E3E", "sMC07Ucy7kg"], "start_seconds": ["180", "10"], "properties": ["man, speak, motorcycles", "weapon, fire, multiple"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage is from a car's point of view"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["water runs into a sink while men speak", "running water in a faucet with some clinks"], "sample_ids": ["vzceMbklWc", "zNRChLjqcU"], "start_seconds": ["180", "220"], "properties": ["water, sink, run", "water, faucet, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and a man is speaking", "water is running from a faucet into a sink"], "question": "which entity has water running into it", "label": 0}, {"captions": ["a man speaks as a car is passing by", "a car accelerates and wind blows"], "sample_ids": ["sK4u5T8hW78", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["a, car, pass", "accelerates, wind, blows"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a race car 
accelerates and revs its engine "], "question": "which car is moving faster", "label": 1}, {"captions": ["an airplane engine spools and people speak", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wTjoRj1se3U", "sLUnaPT5gM8"], "start_seconds": ["390", "0"], "properties": ["airplane, engine, spool", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a jet engine is running and people are talking", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["people speak and tapping occurs", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tFCUUGdREgA", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["people, tap, speak", "airplane, boy, fly"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a plane flying by?", "label": 1}, {"captions": ["people speak as gunfire rings out", "a toilet flushes and a female speaks"], "sample_ids": ["wqTCwqVRDlk", "yaln9y8I7ms"], "start_seconds": ["80", "230"], "properties": ["gunfire, ring, speak", "female, flushes, toilet"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a toilet flushes and a man speaks"], "question": "which entity is more likely to be in a bathroom?", "label": 1}, {"captions": ["water rushes by", "rustling occurs, ducks quack and water splashes, followed by an 
adult female and adult male speaking and duck calls being blown"], "sample_ids": ["x-PeY8Yb8M4", "vfYTJq7nU"], "start_seconds": ["300", "130"], "properties": ["water, rushes, by", "rustling, ducks, quack"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", null], "captions_pred_audio": ["a car is driving on a wet road ", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "people cheer as a vehicle engine revs"], "sample_ids": ["sQGXqGcwOTc", "xjhAnI2q6hM"], "start_seconds": ["3", "6"], "properties": ["audio, kid, giggles", "engine revs, vehicle, people"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["ukg5L09Wpvo", "zj2R0XoFr5k"], "start_seconds": ["150", "50"], "properties": ["a train, a horn, a bell", "airplane, boy, fly"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vVhthZ45k3Y", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["cat, purr, hiss", "gun, shoot, water"], 
"captions_pred_video": ["footage is blurry and out of focus", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause injury", "label": 1}, {"captions": ["water splashes as an animal walks through", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w1ir-sZ3Im8", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["animal, water, splashes", "three men, wind, flow"], "captions_pred_video": ["footage of a group of people riding horses through a river", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be a video", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a woman speaks and then a man speaks"], "sample_ids": ["yPUYU6t3rwo", "vbpKkWvfOu4"], "start_seconds": ["370", "560"], "properties": ["birds chirp, objects are moved around, birds", "a, man, speaks"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["insects buzz and a man speaks", "a woman is speaking and a man is speaking"], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "water splashes as an animal walks through"], "sample_ids": ["vXlk0lIQBFo", "w1ir-sZ3Im8"], "start_seconds": ["470", "90"], "properties": ["wind, speak, vocalize", "animal, water, splashes"], "captions_pred_video": ["- a woman and 
two donkeys in a fenced in area", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vh30P49Po6s", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["loud, continuous, quacks", "gun, shoot, water"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a duck is quacking loudly", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is louder", "label": 0}, {"captions": ["running water in a faucet with some clinks", "running water in a faucet with some clinks"], "sample_ids": ["zNRChLjqcU", "zNRChLjqcU"], "start_seconds": ["220", "220"], "properties": ["water, faucet, run", "water, faucet, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "water is running from a faucet into a sink"], "question": "which entity is a faucet?", "label": 0}, {"captions": ["an airplane engine spools and people speak", "water splashes as an animal walks through"], "sample_ids": ["wTjoRj1se3U", "w1ir-sZ3Im8"], "start_seconds": ["390", "90"], "properties": ["airplane, engine, spool", "animal, water, splashes"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a jet engine is running and people are talking", "water splashes and gurgles as people speak"], "question": "which entity is a moving object", "label": 1}, {"captions": ["an animal quacks rapidly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": 
["vh30P49Po6s", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["animal, quacks, rapidly", "loud, laughter, intermittent"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a duck is quacking loudly", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "loud ringing of a telephone stops followed by a man speaking and a digital beep"], "sample_ids": ["wz7N8YRy74I", "uzQnlJXBbOM"], "start_seconds": ["30", "50"], "properties": ["rooster, crow, background, people", "ringing, beep, stop"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of a person using a cell phone on a table"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a telephone rings and a man speaks"], "question": "which entity is a recording of a telephone call?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["uRExseg-0XI", "wwyfGO2J4"], "start_seconds": ["210", "90"], "properties": ["woman, man, water", "people, applaud, hoot"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", null], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "people are clapping and speaking with background noise "], "question": "which entity has more people", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a car accelerates and wind blows"], "sample_ids": ["sHbXC6na9hg", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["a person, saw, wood", "accelerates, wind, blows"], 
"captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", null], "captions_pred_audio": ["an engine is idling and vibrating", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a woman speaks as she rubs two objects together"], "sample_ids": ["uOpoD0gGXcs", "vzxHnu-SFEw"], "start_seconds": ["120", "80"], "properties": ["chirps, woman, bird", "two objects, woman, speak"], "captions_pred_video": ["a herd of cows grazing in the field", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman is speaking?", "label": 1}, {"captions": ["a man speaks uses a drill", "birds chirp and objects are moved around"], "sample_ids": ["x5eIC7S0fbg", "yPUYU6t3rwo"], "start_seconds": ["60", "370"], "properties": ["A man is speaking, uses a drill, and is a tool", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a person in surgical 
gloves is using a needle to remove a small object from a tooth", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and using a power tool ", "insects buzz and a man speaks"], "question": "which entity is a tool", "label": 0}, {"captions": ["a man speaks uses a drill", "an airplane engine runs"], "sample_ids": ["x5eIC7S0fbg", "yVPZ2MNWpms"], "start_seconds": ["60", "0"], "properties": ["A man is speaking, uses a drill, and is a tool", "engine, airplane, runs"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a car is driving by on the road "], "question": "which entity is a tool", "label": 0}, {"captions": ["a horn honks and then loudly blares", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wnpJndXuxLc", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["horn, honk, loud", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wtDqrBygTcU", "vYkA3cfXp5Q"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "engine, accelerate, idle"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and a motor is running", "an engine 
is idling"], "question": "which engine is running", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "loud clanking and banging with brief male speech"], "sample_ids": ["vbpKkWvfOu4", "sWZzXuWYY"], "start_seconds": ["560", "420"], "properties": ["a, woman, man", "male, speech, banging"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a sewing machine runs and a man speaks"], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a man speaks followed by another man speaking outside"], "sample_ids": ["sAam2NqGhLY", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "two men, speak, follow"], "captions_pred_video": ["of a little girl sleeping on a couch", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a person is snoring", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "people cheer as a vehicle engine revs"], "sample_ids": ["soTOh3zYJfY", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["vehicle, skid, tires", "engine revs, vehicle, people"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a truck is revving its engine and a man is speaking "], "question": "which vehicle is skidding", "label": 0}, {"captions": ["a man speaks then blows a vehicle horn as wind 
blows", "people applaud and hoot and chat quietly"], "sample_ids": ["zALy31PjDl0", "wwyfGO2J4"], "start_seconds": ["21", "90"], "properties": ["a man, a vehicle, a horn", "people, applaud, hoot"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", null], "captions_pred_audio": ["a man is speaking and a car horn is honking", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "people cheer as a vehicle engine revs"], "sample_ids": ["y8WEcpOlT3I", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["wind, speak, buffeting", "engine revs, vehicle, people"], "captions_pred_video": ["on how to use a sewing machine youtube", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sjlVMgdGSK0", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["accelerates, vehicle, race car", "male, duck, laugh"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a stream of water runs briefly"], "sample_ids": ["weDbePuc-Xc", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["music, slaps, human", "stream, water, run"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", 
"a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks as a machine runs", "paper folding and crinkling"], "sample_ids": ["vD6lYD1l0BY", "zPpG3RD8lSs"], "start_seconds": ["330", "20"], "properties": ["a, machine, run", "paper, fold, crinkle"], "captions_pred_video": ["game controller being held in the hands of the person", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["a motor idles, accelerates, then slows down.", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["vYkA3cfXp5Q", "tezvROoo4bs"], "start_seconds": ["30", "40"], "properties": ["speed, idle, accelerate", "audio, throttle, speaking"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of a busy city street with cars parked on both sides 
of the road"], "captions_pred_audio": ["an engine is idling", "a car accelerates and revs while a man speaks "], "question": "which entity is a video of a motor?", "label": 0}, {"captions": ["dishes cling together then a man begins to speak", "a sleeping person emits a gravely snore"], "sample_ids": ["sQGXqGcwOTc", "w2JXXIAdUdg"], "start_seconds": ["3", "10"], "properties": ["cling, speak, dishes", "emits, sleeping, person"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a close up shot of a person's mouth with a toothbrush in it"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a person snoring and a dog whimpering"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "a woman talking as an infant is crying"], "sample_ids": ["x6ijhqRY38s", "tMbMDvT50j8"], "start_seconds": ["250", "12"], "properties": ["bowl, silverware, man", "a, talk, infant"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "shows a little girl covering her face with her hands while sitting at a table"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a baby cries and a woman speaks"], "question": "which entity is about a person talking?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a train horn blows as it passes by"], "sample_ids": ["sncRqQ67iJU", "zVacuqSb4LI"], "start_seconds": ["460", "30"], "properties": ["loud, repeatedly, man", "horn, blows, train"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a person is snoring", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a man talks while vehicles pass by", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sK4u5T8hW78", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["a, man, talk", "loud, laughter, intermittent"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["someone snores nearby", "a man speaks as a car is passing by"], "sample_ids": ["spJCm8tD9Zo", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a, car, pass"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a woman speaks over sizzling noise"], "sample_ids": ["uWPRNLnpy7Y", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["accelerate, laugh, vehicle", "noise, woman, speak"], "captions_pred_video": ["is taken from a car driving down the street", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person speaking over noise?", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "winds blows roughly as a vehicle races past"], "sample_ids": ["w5W5Kqtc8E", "xjvTpk2Zpr8"], "start_seconds": ["100", "70"], "properties": ["wind, engine, scream", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "water flows and trickles"], "sample_ids": ["sDSppXIlJrs", "tB7hWb9gTuQ"], "start_seconds": ["27", "30"], "properties": ["microphone, water, wind", "water, flow, trickle"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["the wind is blowing 
and water is splashing", "water is splashing and gurgling"], "question": "which entity is a video of water flowing?", "label": 1}, {"captions": ["a person sniffles and sneezes", "water quietly rushes by while birds chirp in the background"], "sample_ids": ["uRlbY6aoBU", "sYITalLZjj4"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "water, rushes, background, birds"], "captions_pred_video": [null, "two ducks are swimming in the water near each other"], "captions_pred_audio": ["a man is sneezing ", "wind blows and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a baby cries and a woman moans", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["smDKStoHBJo", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["a, cry, woman", "female, spraying, scream"], "captions_pred_video": ["a man holding a crying baby in his arms", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has a woman spraying?", "label": 1}, {"captions": ["a baby laugh at a sputter", "winds blows roughly as a vehicle races past"], "sample_ids": ["sLUnaPT5gM8", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["laugh, sputter, baby", "wind, blows, vehicle"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "someone whistles a tune"], "sample_ids": ["wwyfGO2J4", "sIXTftIuUgw"], "start_seconds": ["90", "90"], "properties": ["people, applaud, hoot", "someone, tune, whistle"], "captions_pred_video": [null, null], 
"captions_pred_audio": ["people are clapping and speaking with background noise ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a machine beeps continuously"], "sample_ids": ["zuua6-5goWw", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["birds, chirp, quiet, man, speaks", "beeps, machine, continuously"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 0}, {"captions": ["a infant makes noise and is excited", "a dog barks and whimpers"], "sample_ids": ["wIJK3-5y0kA", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["noise, excited, infant", "barks, whimpers, dog"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "the puppies are playing with a toy"], "captions_pred_audio": ["a baby cries and a woman speaks", "a dog is barking and growling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["bees buzz and wind blows", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tMJne1a4AFI", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["bees buzz, wind blows, bees", "engine, laugh, loud"], "captions_pred_video": ["a 
swarm of bees on the ground", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a swarm of bees buzzing around", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["xZepNM9qcRA", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["background, motor, run", "engine revs, vehicle, people"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a truck is revving its engine and a man is speaking "], "question": "which entity has a motor running in the background?", "label": 0}, {"captions": ["waves crash against a shoreline and wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["zdYdyF9-m8U", "viuTg1M-dqg"], "start_seconds": ["7", "30"], "properties": ["wind, crash, shoreline", "two men, speak, follow"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["waves crash and wind blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["ul60S8TXDA8", "uYT5gxnyMWM"], "start_seconds": ["60", "50"], "properties": ["sound, distance, bell", "a, scream, girl"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a woman is speaking and a baby is crying"], "question": "which entity 
has a scream?", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "a piece of wood is being placed down and sawed"], "sample_ids": ["zO-LSSY92ZM", "uiItxDsDMFI"], "start_seconds": ["30", "30"], "properties": ["liquid, surface, sound", "wood, piece, saw"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["steam is hissing and hissing", "a saw is being used with background noise "], "question": "which entity is being cut", "label": 1}, {"captions": ["a clock ticktocks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["v-g-j2uTByM", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["ticktocks, clock, ticktocks", "loud, jet engine, roar"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a clock is ticking loudly", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["children speak and play together", 
"a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yVVP8XvWJTo", "uYT5gxnyMWM"], "start_seconds": ["260", "50"], "properties": ["children, speak, play", "a, scream, girl"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking followed by a scream?", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "a duck quacks loudly and continuously"], "sample_ids": ["sd7xVssqlw", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["accelerates, tires, squealing", "loud, continuous, quacks"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["bees buzz as wind blows", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tMJne1a4AFI", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["bees, buzz, wind", "male, duck, laugh"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a small engine spits as it runs", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sZvwOuuPGP0", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["spits, engine, runs", "clickety-clack, train, whistle"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a 
medium engine is running ", "a train blows its whistle and blows its horn "], "question": "which train whistle keeps going off", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "a man speaks as a car is passing by"], "sample_ids": ["wRBHTgrbiwg", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["birds, chirp, cooing", "a, car, pass"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yYJksgsxx5U", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["audio, woman, silverware", "wind, blow, vehicle"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", null], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a man speaks followed by another man speaking outside"], "sample_ids": ["w1mlz3Pe4fU", "viuTg1M-dqg"], "start_seconds": ["300", "30"], "properties": ["vocalize, chirp, continuously", "two men, speak, follow"], "captions_pred_video": ["of a bird in a cage", "footage of 
water coming out of a hole in the ground"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a human", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "an airplane engine spools and people speak"], "sample_ids": ["tDVADusiIoc", "wTjoRj1se3U"], "start_seconds": ["60", "390"], "properties": ["water, radio, man", "airplane, engine, spool"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine is running and people are talking"], "question": "which entity is a video of an airplane engine spooling?", "label": 1}, {"captions": ["a person snoring", "speaking following by laughing and clapping"], "sample_ids": ["t8tv5YRMJUg", "u2f5NpsoHBg"], "start_seconds": ["0", "30"], "properties": ["a person, snore, loud", "person, laugh, clap"], "captions_pred_video": ["of a man getting his face licked by another man", "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a woman is speaking and a crowd is clapping"], "question": "which person is more likely to be clapping", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "some men converse over an engine running"], "sample_ids": ["yeFvk9x0wWI", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["clack, bird, chirp", "men, converse, engine"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", 
"various birds chirp and squeal, and an animal grunts"], "sample_ids": ["x5cuQjOdM3E", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["cat, meows, young woman", "animal, grunts, chirps"], "captions_pred_video": ["a black background with an airplane flying in the sky", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a cat meows and a woman speaks", "birds are chirping and a rooster is crowing "], "question": "which entity is a domesticated animal", "label": 0}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a duck quacks continuously"], "sample_ids": ["wSVhSdj0F0", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["beep, clang, footsteps", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a infant makes noise and is excited"], "sample_ids": ["sYITalLZjj4", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["water, rushes, background, birds", "noise, excited, infant"], "captions_pred_video": ["two ducks are swimming in the water near each other", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["wind blows and birds chirp", "a baby cries and a woman speaks"], "question": "which entity is quieter", "label": 0}, {"captions": ["water splashes as an animal walks through", "a car accelerates and wind blows"], "sample_ids": ["w1ir-sZ3Im8", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["animal, water, splashes", "accelerates, wind, blows"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a 
race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "people speak as gunfire rings out"], "sample_ids": ["zCrAfDfv6-A", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["person, mouse, click", "gunfire, ring, speak"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person whistles a song", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["shmR4OZtzqA", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["man, engine, idle", "clickety-clack, train, whistle"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man speaks while a motor runs", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "people cheer as a vehicle 
engine revs"], "sample_ids": ["xvDdE3zNf8Y", "xjhAnI2q6hM"], "start_seconds": ["120", "6"], "properties": ["A, crumple, paper", "engine revs, vehicle, people"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman speaks and crumples paper", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a man speaks as a motor runs in the background"], "sample_ids": ["su6FAOcOA8c", "xZepNM9qcRA"], "start_seconds": ["4", "30"], "properties": ["engine, run, woman", "background, motor, run"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xKB8O8LTs6s", "yDoT73BWsdA"], "start_seconds": ["70", "10"], "properties": ["music, gunfire, explosion", "engine, revs, vehicle"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a race car accelerates and revs its engine "], "question": "which entity is more quiet", "label": 1}, {"captions": ["children speak and play together", "a toilet flushes and water drains"], "sample_ids": ["yVVP8XvWJTo", "sfAvvZwdLCY"], "start_seconds": ["260", "20"], "properties": ["children, speak, play", "water drains, flushes, water"], "captions_pred_video": 
["footage of a playground at a school or daycare center", "footage of the toilet in the bathroom"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vKrYfzleLB8", "vb1fPSDI4c"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "multiple, people, yell"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", null], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking?", "label": 1}, {"captions": ["someone whistles a song", "winds blows roughly as a vehicle races past"], "sample_ids": ["sIXTftIuUgw", "xjvTpk2Zpr8"], "start_seconds": ["90", "70"], "properties": ["someone, song, whistle", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a person whistling a song", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uoGVs9yUqY4", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["multiple, vocalize, wind", "engine, idle, woman"], "captions_pred_video": ["for how to make a wooden shed door youtube", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a man 
speaks over a radio as wind blows and water splashes"], "sample_ids": ["vXlk0lIQBFo", "tDVADusiIoc"], "start_seconds": ["470", "60"], "properties": ["wind, speak, vocalize", "water, radio, man"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a piece of wood is being placed down and sawed"], "sample_ids": ["uRExseg-0XI", "uiItxDsDMFI"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "wood, piece, saw"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "a man cutting a log with an axe in the woods"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a saw is being used with background noise "], "question": "which entity is about a piece of wood being sawed?", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "pigeons vocalize and birds chirp"], "sample_ids": ["sShpyu2l4YQ", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["growl, bark, yip", "vocalize, bird, chirp"], "captions_pred_video": ["the puppies are playing with a toy", "of the pigeon in the cage"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tDlfY3nmx1A", "zFjIWfSD-4"], "start_seconds": ["160", "410"], "properties": ["applause, laugh, man", "People, motor, brakes"], 
"captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", null], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a large bell chimes back and forth loudly", "pigeons vocalize and birds chirp"], "sample_ids": ["w2M4i1mklOA", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["loud, chime, bell", "vocalize, bird, chirp"], "captions_pred_video": ["footage of an antique clock", "of the pigeon in the cage"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "a stream of water flows as people talk and wind blows"], "sample_ids": ["w5W5Kqtc8E", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a man speaks as horns blow", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["tHyNqRyK34A", "ukg5L09Wpvo"], "start_seconds": ["24", "150"], "properties": ["a, man, speaks", "clickety-clack, train, whistle"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a 
motor runs in the distance as a soft wind periodically gusts", "vehicles pass by on a roadway"], "sample_ids": ["xyL9F5VrjkE", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["wind, motor, distance", "pass, vehicle, roadway"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a fire truck entering a garage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["y1saVTXsKwc", "tDVADusiIoc"], "start_seconds": ["80", "60"], "properties": ["a, dog, talk", "water, radio, man"], "captions_pred_video": ["a dog playing with a pink ball", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["a fly buzzes around loudly as birds chirp", "a train horn blows as it passes by"], "sample_ids": ["uJV8NDaHqqk", "zVacuqSb4LI"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "horn, blows, train"], "captions_pred_video": ["a bee hive in a wooden box", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a swarm of bees buzzing around", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "paper is crumpling consistently"], "sample_ids": ["uYT5gxnyMWM", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["person, spray, yell", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "paper is crumpled and crinkled"], "question": "which entity is a video of a person speaking over spraying and another person yelling?", "label": 0}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a telephone rings followed by a woman talking"], "sample_ids": ["weDbePuc-Xc", "tGcFnX0GHI"], "start_seconds": ["40", "0"], "properties": ["cartoon character, music, vocalize", "ring, talk, woman"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y2bVZ7rz-5M", "sSMl2vc3ek"], "start_seconds": ["280", "20"], "properties": ["motor noise, horn, siren", "loud, multiple, distance"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring 
", "a person snoring loudly"], "question": "which entity is louder", "label": 0}, {"captions": ["an adult male speaks and dials a rotary phone", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tK4VlLsNxak", "tdWhHV3X25Q"], "start_seconds": ["120", "60"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "applause, audience, yells"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uiS58TNyUiw", "wDVMhEdTiVw"], "start_seconds": ["430", "30"], "properties": ["audio, man, speaking", "gun, shoot, water"], "captions_pred_video": ["of the pigeon in the cage", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "birds chirp and objects are moved around"], "sample_ids": ["vKrYfzleLB8", "yPUYU6t3rwo"], "start_seconds": ["110", "370"], "properties": ["a, ring, gunshots", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "wind 
blowing followed by a zoom"], "sample_ids": ["uPDn2BFTHk", "vr8ZXjEBhMQ"], "start_seconds": ["140", "150"], "properties": ["woman, laughs, speaks", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a baby laughs and a woman speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a video of a child and a woman laughing?", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "a car accelerates and wind blows"], "sample_ids": ["sa6TLVbooCc", "u0TrcHhkPQ"], "start_seconds": ["240", "20"], "properties": ["people, laugh, child", "accelerates, wind, blows"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", null], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["someone whistles a song", "a train engine runs and a horn blows"], "sample_ids": ["sIXTftIuUgw", "zPX9o1uDiI"], "start_seconds": ["90", "40"], "properties": ["someone, song, whistle", "engine, horn, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is a train", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["uPDn2BFTHk", "xjhAnI2q6hM"], "start_seconds": ["140", "6"], "properties": ["woman, laughs, speaks", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a woman speaks 
as she rubs two objects together"], "sample_ids": ["yajyRTUQk3U", "vzxHnu-SFEw"], "start_seconds": ["400", "80"], "properties": ["noise, woman, speak", "two objects, woman, speak"], "captions_pred_video": ["- a woman cooking in the kitchen", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman speaks over sizzling noise", "label": 0}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "some tunes played by whistling"], "sample_ids": ["slZLHwNbbt4", "u6BnG6YZqJ4"], "start_seconds": ["300", "0"], "properties": ["clap, distance, horn", "tune, play, whistling"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["birds chirp 
and an insect buzzes around", "water flows as men speak and yell"], "sample_ids": ["t97k0cejSQE", "vJ7JPEFhyLA"], "start_seconds": ["250", "16"], "properties": ["bird, chirp, insect", "water, flow, men"], "captions_pred_video": ["a bee on a purple thistle flower", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "birds chirp and an insect buzzes around"], "sample_ids": ["sYITalLZjj4", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["water, rushes, background, birds", "bird, chirp, insect"], "captions_pred_video": ["two ducks are swimming in the water near each other", "a bee on a purple thistle flower"], "captions_pred_audio": ["wind blows and birds chirp", "a bee buzzes and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "an infant crying as a woman laughs"], "sample_ids": ["w2JXXIAdUdg", "xhmRY9yhC7c"], "start_seconds": ["10", "20"], "properties": ["emits, sleeping, person", "a, laugh, infant"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person is snoring while sleeping", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vJrjSeP17yE", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["a person is sleeping, snoring, person", "three men, wind, flow"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person 
snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a person", "label": 0}, {"captions": ["people applaud and hoot and chat quietly", "an engine runs loudly"], "sample_ids": ["wwyfGO2J4", "vqZuVbG6-HI"], "start_seconds": ["90", "130"], "properties": ["people, applaud, hoot", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "some men converse over an engine running"], "sample_ids": ["wnpJndXuxLc", "sCiy7QS1U"], "start_seconds": ["50", "300"], "properties": ["beeps, loud, whistle", "men, converse, engine"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a train?", "label": 0}, {"captions": ["an adult woman and an adult man speak", "a man speaks, then dials a rotary telephone"], "sample_ids": ["zTLVJCo4WEE", "tK4VlLsNxak"], "start_seconds": ["30", "120"], "properties": ["two people, adult, speak", "a, dial, telephone"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "person is wearing a headset and holding a remote control in his hand"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking and using a sewing machine"], "question": "which entity shows a man speaking?", "label": 1}, {"captions": ["a weapon fires multiple times", "a duck quacks continuously"], "sample_ids": ["sMC07Ucy7kg", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["weapon, fire, multiple", "quacks, continuously, 
duck"], "captions_pred_video": ["footage is from a car's point of view", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wjsXBsc7M40", "tDVADusiIoc"], "start_seconds": ["10", "60"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "water, radio, man"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["y2bVZ7rz-5M", "uEU-Hg5MTN8"], "start_seconds": ["280", "27"], "properties": ["engine, horn, siren", "animal, grunts, snorts"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking and a baby is crying"], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a frog vocalizes while birds chirp", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vMf1dLD6Sng", "vJ7JPEFhyLA"], "start_seconds": ["6", "16"], "properties": ["frog, bird, vocalize", "three men, wind, flow"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a frog croaks loudly", "a man is speaking while the 
wind is blowing and water is splashing"], "question": "which entity is not a frog?", "label": 1}, {"captions": ["people speak and tapping occurs", "some tunes played by whistling"], "sample_ids": ["tFCUUGdREgA", "u6BnG6YZqJ4"], "start_seconds": ["70", "0"], "properties": ["people, tap, speak", "tune, play, whistling"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "an airplane flies overhead as a woman speaks"], "sample_ids": ["su6FAOcOA8c", "zj2R0XoFr5k"], "start_seconds": ["4", "50"], "properties": ["engine, run, woman", "airplane, fly, overhead"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying overhead", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a person screams glaringly"], "sample_ids": ["zFjIWfSD-4", "xC8kbrKJmco"], "start_seconds": ["410", "0"], "properties": ["People, motor, brakes", "glaringly, screams, person"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a goat is bleating "], "question": "which entity is more agressive", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yPUYU6t3rwo", "tDVADusiIoc"], "start_seconds": ["370", "60"], "properties": ["birds chirp, objects are moved around, 
birds", "water, radio, man"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["insects buzz and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a helicopter engine runs continuously"], "sample_ids": ["tOSWIURC-4", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["engine, work, nearby", "engine, running, continuously"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a lawn mower is running ", "a helicopter is flying overhead "], "question": "which entity has a running engine", "label": 1}, {"captions": ["multiple ducks quack continuously", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["wfHeoPDLMaM", "s7knHCFW82w"], "start_seconds": ["30", "30"], "properties": ["multiple, quack, continuously", "blow horn, get close, train"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a 
fire", "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["ducks are quacking", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is a train", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zF8yoL0rkbI", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["engine, run, someone", "rustling, ducks, quack"], "captions_pred_video": ["footage of the traffic on the street at night", null], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a duck quacks and a woman speaks"], "question": "which entity is about a vehicle engine running?", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "a car accelerates and wind blows"], "sample_ids": ["s3cTDAj31g", "u0TrcHhkPQ"], "start_seconds": ["80", "20"], "properties": ["man, talk, woman", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sa6TLVbooCc", "xKB8O8LTs6s"], "start_seconds": ["240", "70"], "properties": ["people, laugh, child", "music, gunfire, explosion"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["an airplane accelerates briefly", "a vehicle engine revs 
as the vehicle passes"], "sample_ids": ["zjTG0gaGCUI", "yDoT73BWsdA"], "start_seconds": ["80", "10"], "properties": ["accelerates, airplane, briefly", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a race car accelerates and revs its engine "], "question": "which is a vehicle", "label": 1}, {"captions": ["a toilet flushes and water drains", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sfAvvZwdLCY", "vfYTJq7nU"], "start_seconds": ["20", "130"], "properties": ["water drains, flushes, water", "rustling, ducks, quack"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a duck quacks and a woman speaks"], "question": "which entity is about water?", "label": 0}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["y2ZBGpgbhHM", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["birds, tweet, pant", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about animals?", "label": 0}, {"captions": ["a rumble grows louder", "people applaud and hoot and chat quietly"], "sample_ids": ["y4MY9mp8-TA", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["loudness, increase, rumble", "people, applaud, hoot"], "captions_pred_video": ["a helicopter flying in the sky", null], "captions_pred_audio": ["a helicopter flies overhead ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, 
{"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a stream of water runs briefly"], "sample_ids": ["weDbePuc-Xc", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["cartoon character, music, vocalize", "stream, water, run"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "an engine runs loudly"], "sample_ids": ["xO-Q2BlIIPU", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["two men, exclamation, speak", "loud, engine, run"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a power tool runs and touches a surface"], "sample_ids": ["s3cTDAj31g", "zfvPRf3chY"], "start_seconds": ["80", "290"], "properties": ["man, talk, woman", "power tool, run, touch"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking while a power tool is being used "], "question": "which entity is a machine", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a man speaks as a machine runs"], "sample_ids": ["tDlysoZiA1I", "vD6lYD1l0BY"], "start_seconds": ["0", "330"], "properties": ["animal, grunts, chirps", "a, machine, run"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "game 
controller being held in the hands of the person"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking and dishes are being washed "], "question": "which entity is a man speaking to a machine?", "label": 1}, {"captions": ["birds vocalize and a man speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["v0wPrLBI3hg", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["vocalize, bird, speak", "three men, wind, flow"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["paper is crumpling consistently", "water is sprayed across a hard surface"], "sample_ids": ["v5cSxLaHADY", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "water, spray, surface"], "captions_pred_video": ["footage of the person holding a pair of scissors", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["paper is crumpled and crinkled", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a horn honks and then loudly blares", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wnpJndXuxLc", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["horn, honk, loud", "engine, idle, woman"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is quieter", "label": 1}, {"captions": 
["ticking continues without interruption", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["v-g-j2uTByM", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["ticking, continuous, clock", "loud, laughter, intermittent"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a clock is ticking loudly", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is continuous", "label": 0}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "a man speaks followed by another man speaking outside"], "sample_ids": ["wRBHTgrbiwg", "viuTg1M-dqg"], "start_seconds": ["50", "30"], "properties": ["birds, chirp, cooing", "two men, speak, follow"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a car accelerates and wind blows"], "sample_ids": ["siJFXfGWgDk", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["man, woman, vehicle", "accelerates, wind, blows"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a race car accelerates and revs its engine "], "question": "which entity is about a car?", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["uZesmtKZGSw", "w6RTHR6AeAg"], "start_seconds": ["250", "40"], "properties": ["car, track, man", "call, owl, screech"], 
"captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "an owl hoots and mechanisms operate "], "question": "which entity is a bird?", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tIY7qOV3rEM", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "a, scream, girl"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["birds tweet and squawk", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["w1mlz3Pe4fU", "yajyRTUQk3U"], "start_seconds": ["300", "400"], "properties": ["squawk, tweet, scream", "a woman, something, fried"], "captions_pred_video": ["of a bird in a cage", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and singing", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "water pouring and bubbling"], "sample_ids": ["vMf1dLD6Sng", 
"uyRfq-jKPpo"], "start_seconds": ["6", "50"], "properties": ["frog, bird, vocalize", "water, bubbles, pouring"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a frog croaks loudly", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vXlk0lIQBFo", "su6FAOcOA8c"], "start_seconds": ["470", "4"], "properties": ["wind, talk, vocalize", "engine, idle, woman"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["frogs croak and vocalize", "an engine runs loudly"], "sample_ids": ["yswmmRZFItk", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["croak, vocalize, frog", "loud, engine, run"], "captions_pred_video": ["a close up of a frog in the water", "footage is blurry because it's raining outside"], 
"captions_pred_audio": ["a frog is croaking", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a clock ticktocks briefly"], "sample_ids": ["tiDFTC-5vU", "u7C-AEBQM"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a ticktock of a clock"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "vehicles pass by on a roadway"], "sample_ids": ["xzKKf9bKNUo", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["background, noise, snoring", "pass, vehicle, roadway"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person snoring loudly", "a car is driving on the road "], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a drill runs and two people laugh", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tEE3MpBt1sg", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "gun, shoot, water"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a person whistles a meandering tune", "a woman speaks as frying food sizzles"], "sample_ids": ["uFoga8sHpiw", "wTideSjRFS0"], "start_seconds": ["90", "30"], "properties": ["person, tune, whistle", "food, sizzle, woman"], "captions_pred_video": 
["footage of a bird in a cage", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking while water is running in the background"], "question": "which entity is a person", "label": 0}, {"captions": ["a vehicle accelerates and squeals tires", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["yRx9txMcBl0", "xKB8O8LTs6s"], "start_seconds": ["40", "70"], "properties": ["accelerates, tires, squeals", "music, gunfire, explosion"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a car is revving its engine and skidding ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vYkA3cfXp5Q", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["speed, idle, accelerate", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a dog is whimpering"], "question": "which entity is a still image?", "label": 0}, {"captions": 
["three men talk while wind blows and some liquid flows", "a woman speaks as she rubs two objects together"], "sample_ids": ["vJ7JPEFhyLA", "vzxHnu-SFEw"], "start_seconds": ["16", "80"], "properties": ["three men, wind, flow", "two objects, woman, speak"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity shows a woman speaking?", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "paper is crumpling consistently"], "sample_ids": ["vbpKkWvfOu4", "v5cSxLaHADY"], "start_seconds": ["560", "0"], "properties": ["a, man, speaks", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of 
the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "paper is crumpled and crinkled"], "question": "which entity is a video of a woman speaking and then a man speaking?", "label": 0}, {"captions": ["the wind blows while a vehicle engine runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xyL9F5VrjkE", "tDVADusiIoc"], "start_seconds": ["20", "60"], "properties": ["wind, blows, vehicle", "water, radio, man"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a vehicle running?", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["y8WEcpOlT3I", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["wind, speak, buffeting", "loud, jet engine, roar"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tDlysoZiA1I", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["animal, grunt, chirp", "engine, laugh, loud"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "people cheer as a 
vehicle engine revs"], "sample_ids": ["tDlysoZiA1I", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["animal, grunt, multiple", "engine revs, vehicle, people"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "three men talk while wind blows and some liquid flows"], "sample_ids": ["x4a9YGIw4ok", "vJ7JPEFhyLA"], "start_seconds": ["120", "16"], "properties": ["water, gurgles, stops", "three men, wind, flow"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet flushes and water splashes", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about liquid flowing?", "label": 1}, {"captions": ["an audience gives applause", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["x6iCUDmRpKQ", "uYT5gxnyMWM"], "start_seconds": ["38", "50"], "properties": ["applause, audience, give", "female, spraying, scream"], "captions_pred_video": ["a black background with the moon and stars in the sky", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a group of people are clapping and cheering", "a woman is speaking and a baby is crying"], "question": "which entity is a performance", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a small engine idles continuously"], "sample_ids": ["zgUgkpk78xU", "y5WII6cTH7k"], "start_seconds": ["70", "40"], "properties": ["clinking, humming, horn", "engine, idle, continuously"], "captions_pred_video": ["of a train 
passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "an engine is knocking and vibrating "], "question": "which entity is a train", "label": 0}, {"captions": ["running water in a faucet with some clinks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zNRChLjqcU", "tdWhHV3X25Q"], "start_seconds": ["220", "60"], "properties": ["water, faucet, run", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "some men converse over an engine running"], "sample_ids": ["u6jIvCtKarQ", "sCiy7QS1U"], "start_seconds": ["70", "300"], "properties": ["a, man, speaks", "men, converse, engine"], "captions_pred_video": ["footage of a person using a blender on a stove top", null], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has more people", "label": 1}, {"captions": ["wind blows strongly", "a motorcycle idles loudly as wind blows"], "sample_ids": ["w8uLijTqtlU", "v7jJS8aAyA"], "start_seconds": ["70", "10"], "properties": ["wind, blows, strongly", "wind, blows, loudly"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a motorcycle engine is idling and 
vibrating"], "question": "which entity is louder", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a clock ticktocks"], "sample_ids": ["xERFUeZONz8", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an emergency vehicle siren blares", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vf9xf3vMsGM", "wz7N8YRy74I"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "rooster, crow, background, men"], "captions_pred_video": ["of the person washing their hands under the faucet", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["water rushes by", "water flows as men speak and yell"], "sample_ids": ["x-PeY8Yb8M4", "vJ7JPEFhyLA"], "start_seconds": ["300", "16"], "properties": ["water, rushes, by", "water, flow, men"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["an insect buzzes around continuously", "a man speaks followed by another man speaking outside"], "sample_ids": ["v25l1jef3JY", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["buzzes, 
continuously, insect", "two men, speak, follow"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single speaker?", "label": 0}, {"captions": ["a kid speaks followed by music playing", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tQWGZLItBXk", "tiDFTC-5vU"], "start_seconds": ["170", "30"], "properties": ["music, kid, speak", "male, duck, laugh"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a telephone rings followed by a woman talking"], "sample_ids": ["sapQIQUhFc", "tGcFnX0GHI"], "start_seconds": ["280", "0"], "properties": ["liquid, flow, distance", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wTjoRj1se3U", "yDoT73BWsdA"], "start_seconds": ["390", "10"], "properties": ["engine, run, people", "engine, revs, vehicle"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a jet engine is running and people are talking", "a race car accelerates and revs its engine "], "question": "which entity is a 
vehicle?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "an infant crying as a woman laughs"], "sample_ids": ["uYT5gxnyMWM", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["female, spraying, scream", "a, laugh, infant"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person is snoring while sleeping", "people cheer as a vehicle engine revs"], "sample_ids": ["vJrjSeP17yE", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["a person is sleeping, snoring, person", "engine revs, vehicle, people"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person snoring loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an airplane engine runs", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yVPZ2MNWpms", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["engine, airplane, runs", "People, motor, brakes"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["se87d6yxEOA", "tiDFTC-5vU"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "male, duck, laugh"], "captions_pred_video": ["footage of a train passing by a train station 
with smoke billowing out of the train's smokestack", null], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["an insect buzzes around continuously", "a person sniffs and sneezes"], "sample_ids": ["v25l1jef3JY", "uRlbY6aoBU"], "start_seconds": ["0", "0"], "properties": ["buzzes, continuously, insect", "sneezes, person, sniffs"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is sneezing "], "question": "which entity is not a person?", "label": 0}, {"captions": ["a crowd yells, reacts and applauds", "waves crash against a shoreline and people speak"], "sample_ids": ["wztCSUxOf8", "yFB25fqfU8I"], "start_seconds": ["130", "300"], "properties": ["a crowd, yells, applauds", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is a natural phenomenon", "label": 1}, {"captions": ["a stream runs then someone speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["wbHTKEJZyhc", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["stream, run, someone", "male, duck, laugh"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with 
trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and ducks are quacking"], "question": "which entity is a video of a duck quacking?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a woman speaks as she rubs two objects together"], "sample_ids": ["vZAw4apG0Es", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["background, tick, repeat", "two objects, woman, speak"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a clock is ticking and people are talking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a woman speaks and then a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["vbpKkWvfOu4", "sK4u5T8hW78"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "a, car, pass"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["heavy rain splashes as it falls", "multiple birds chirp and an animal grunts"], "sample_ids": ["wP8ZKrlx3oA", "tDlysoZiA1I"], "start_seconds": ["40", "0"], "properties": ["fall, rain, splash", "animal, grunt, multiple"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a heavy rain is falling on a surface", "birds are chirping and a rooster is crowing "], "question": "which entity is more likely to be heard", "label": 1}, 
{"captions": ["speaking following by laughing and clapping", "birds chirp and objects are moved around"], "sample_ids": ["u2f5NpsoHBg", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["person, laugh, clap", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "insects buzz and a man speaks"], "question": "which entity is a bird", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "water flows as men speak and yell"], "sample_ids": ["u6jIvCtKarQ", "vJ7JPEFhyLA"], "start_seconds": ["70", "16"], "properties": ["a, man, speaks", "water, flow, men"], "captions_pred_video": ["footage of a person using a blender on a stove top", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water flowing", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zgUgkpk78xU", "xKB8O8LTs6s"], "start_seconds": ["70", "70"], "properties": ["clinking, humming, horn", "music, gunfire, explosion"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "music plays while a 
woman speaks and gunshots are fired "], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a toilet door squeaks as it is opened", "repeated tapping is accompanied by water running and a woman speaking softly"], "sample_ids": ["sdXV-ylviw", "wvKpEYswXO0"], "start_seconds": ["190", "150"], "properties": ["door, toilet, squeaks", "sound, water, running"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is accompanied by water running", "label": 1}, {"captions": ["male speech with light ticking", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xO-Q2BlIIPU", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["male, speech, ticking", "two men, woman, birds"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a duck quacks several times", "water rushes and then a vehicle zooms past"], "sample_ids": ["vh30P49Po6s", "s4Uz1Ffgo04"], "start_seconds": ["30", "100"], "properties": ["quacks, duck, several", "water, rushes, vehicle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sTpirNYo8vQ", 
"vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["a, tone, fast", "three men, wind, flow"], "captions_pred_video": ["of a man taking a selfie on a bus", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a more natural setting", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "water splashes as an animal walks through"], "sample_ids": ["tGcFnX0GHI", "w1ir-sZ3Im8"], "start_seconds": ["0", "90"], "properties": ["ring, talk, woman", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a consistent ticking pattern", "a car accelerates and wind blows"], "sample_ids": ["sCeWURVHfOM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["ticking, pattern, clock", "accelerates, wind, blows"], "captions_pred_video": ["- a close-up view of the clock's inner workings", null], "captions_pred_audio": ["ticking of a clock", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a child speaks in closed space"], "sample_ids": ["yJ0TePmaOo", "yW6FWLSLkx4"], "start_seconds": ["390", "40"], "properties": ["two hard objects, man, speak", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, 
{"captions": ["a airplane flies overhead as a woman speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zj2R0XoFr5k", "wqZ135Ssz0"], "start_seconds": ["50", "60"], "properties": ["airplane, fly, woman", "two men, woman, birds"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["scraping and female speech with distant music", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yHeVV-xeOxQ", "tDVADusiIoc"], "start_seconds": ["130", "60"], "properties": ["female, speech, music", "water, radio, man"], "captions_pred_video": ["of a girl milking a goat's udder", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a clock ticktocks briefly", "small dogs yip and bark sharply"], "sample_ids": ["u7C-AEBQM", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["ticktocks, clock, ticktocks briefly", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a ticktock of a clock", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a man talks while a clock does ticktock"], "sample_ids": ["xvDdE3zNf8Y", "spYNpeN7rPY"], "start_seconds": ["120", "1"], "properties": ["a, female, speaks", "a clock, ticktock, man"], "captions_pred_video": ["of a woman in a white shirt and glasses 
holding a purple tie", "in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and breathing with background noise "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a man speaks as a motor runs in the background"], "sample_ids": ["sfAvvZwdLCY", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "background, motor, run"], "captions_pred_video": ["footage of the toilet in the bathroom", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a toilet is flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a car speeding up in the distance"], "sample_ids": ["sG7TyPnFDR0", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": 
["beeps, machine, smoke alarm", "distance, car, speed"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "winds blows roughly as a vehicle races past"], "sample_ids": ["w2M4i1mklOA", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["loud, chime, bell", "wind, blows, vehicle"], "captions_pred_video": ["footage of an antique clock", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a woman speaks and other women and a man talk with her", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vbpKkWvfOu4", "su6FAOcOA8c"], "start_seconds": ["560", "4"], "properties": ["a, woman, man", "engine, idle, woman"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a woman is speaking and a subway train is moving "], "question": "which woman is speaking", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "someone is typing on a computer keyboard"], "sample_ids": ["tEE3MpBt1sg", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["drill, something, laugh", "keyboard, type, computer"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "how to make money on 
youtube in spanish"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a person is typing on a keyboard"], "question": "which is not a drill", "label": 1}, {"captions": ["motors runs briefly and tires screech", "vehicles pass by on a roadway"], "sample_ids": ["yRx9txMcBl0", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["motors, tires, screech", "pass, vehicle, roadway"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a vehicle accelerates and squeals tires"], "sample_ids": ["sQGXqGcwOTc", "yRx9txMcBl0"], "start_seconds": ["3", "40"], "properties": ["audio, kid, giggles", "accelerates, tires, squeals"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 
mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "some men converse over an engine running"], "sample_ids": ["zhx6hoYrHeI", "sCiy7QS1U"], "start_seconds": ["160", "300"], "properties": ["engine, sputter, rough", "men, converse, engine"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a person is snoring while sleeping", "water flows as men speak and yell"], "sample_ids": ["vJrjSeP17yE", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["a person is sleeping, snoring, person", "water, flow, men"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking and yelling?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["yYJksgsxx5U", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["audio, woman, silverware", "a woman, something, fried"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a woman is speaking while food is frying in the 
background"], "question": "which woman is speaking over chopping and silverware noises", "label": 0}, {"captions": ["a man speaks in the background while a slow tick repeats", "a man speaks as a car is passing by"], "sample_ids": ["vZAw4apG0Es", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "a, car, pass"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a moving object", "label": 1}, {"captions": ["a child speaks", "an infant crying as a woman laughs"], "sample_ids": ["yW6FWLSLkx4", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["a, child, speaks", "a, laugh, infant"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "birds chirp and objects are moved around"], "sample_ids": ["w1mlz3Pe4fU", "yPUYU6t3rwo"], "start_seconds": ["300", "370"], "properties": ["vocalize, chirp, continuously", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a bird in a cage", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": 
["birds are chirping and singing", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["continuous snoring", "a person sniffles and then sneezes in the distance"], "sample_ids": ["sLkeqCDJIyw", "uRlbY6aoBU"], "start_seconds": ["120", "0"], "properties": ["loud, snoring, noise", "a, distance, sneeze"], "captions_pred_video": [", what is the man doing on the couch? sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "a man is sneezing "], "question": "which entity is not loud", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a duck quacks continuously"], "sample_ids": ["s3cTDAj31g", "vh30P49Po6s"], "start_seconds": ["80", "30"], "properties": ["man, talk, woman", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "paper is crumpling consistently"], "sample_ids": ["zY3icUyMdh8", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["dog, bark, engine", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a motorcycle engine is idling", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["vZAqdHZ81yA", "wSVhSdj0F0"], "start_seconds": ["180", "10"], "properties": ["engine, motorcycle, idling", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end 
facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a car horn honks and keys jangle with background noise "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "an insect buzzes around continuously"], "sample_ids": ["zl9Dqx-j7q4", "v25l1jef3JY"], "start_seconds": ["6", "0"], "properties": ["engine, laugh, loud", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a man driving a car in the dark", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a jet engine roars ", "a fly is buzzing around a microphone "], "question": "which entity is not a human", "label": 1}, {"captions": ["a small engine spits as it runs", "water flows as men speak and yell"], "sample_ids": ["sZvwOuuPGP0", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["spits, engine, runs", "water, flow, men"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and yelling?", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "an engine runs loudly"], "sample_ids": ["sZPuqDgX2V0", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["commentator, race, track", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a lawn mower is running and men are speaking "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a telephone rings followed by a woman talking", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tGcFnX0GHI", 
"vYkA3cfXp5Q"], "start_seconds": ["0", "30"], "properties": ["ring, talk, woman", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "an adult woman and an adult man speak"], "sample_ids": ["xjhAnI2q6hM", "zTLVJCo4WEE"], "start_seconds": ["6", "30"], "properties": ["wind, blow, loudly", "two people, adult, speak"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman speaks and crickets chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone is burping continuously", "a drill runs and two people laugh"], "sample_ids": ["y636gklDioE", "tEE3MpBt1sg"], "start_seconds": ["20", "50"], "properties": ["burps, burps, burps", "two people, laugh, drill"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a person burps loudly several times", "people are laughing breathing and speaking with background noise "], "question": "which entity is more likely to be a joke", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wqUmIEzuNz4", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["frog, bird, vocalize", "stream, water, flow"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["a cat meows and rustles", "a man is speaking with wind noise in the background "], "question": 
"which entity is moving", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a child speaks in closed space"], "sample_ids": ["tDVADusiIoc", "yW6FWLSLkx4"], "start_seconds": ["60", "40"], "properties": ["water, radio, man", "child, space, speak"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["multiple ducks quack continuously", "a dog barks and whimpers"], "sample_ids": ["wfHeoPDLMaM", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["multiple, quack, continuously", "barks, whimpers, dog"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "the puppies are playing with a toy"], "captions_pred_audio": ["ducks are quacking", "a dog is barking and growling"], "question": "which entity is a dog", "label": 1}, {"captions": ["people speak in the background as a clock 
ticktocks", "an airplane engine spools and people speak"], "sample_ids": ["vZAw4apG0Es", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["background, clock, ticktocks", "airplane, engine, spool"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a clock is ticking and people are talking", "a jet engine is running and people are talking"], "question": "which entity is a video of a clock ticking?", "label": 0}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["sofxkNWaP0s", "siJFXfGWgDk"], "start_seconds": ["30", "50"], "properties": ["wind, engine, louder", "a, bird, vehicle"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking and birds are chirping in the background "], "question": "which entity is about a vehicle passing nearby?", "label": 1}, {"captions": ["a cat meows and children speak", "pigeons vocalize and birds chirp"], "sample_ids": ["x5cuQjOdM3E", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["cat, speak, children", "vocalize, bird, chirp"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the pigeon in the cage"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a propeller moves loudly nearby", "a person uses a saw to cut some wood"], "sample_ids": ["ugHJF0hfYkg", "sHbXC6na9hg"], "start_seconds": ["10", "0"], "properties": ["loud, propeller, move", "a person, saw, wood"], "captions_pred_video": ["a man in a 
helicopter cockpit wearing headphones", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a helicopter is flying overhead ", "an engine is idling and vibrating"], "question": "which entity is quieter", "label": 1}, {"captions": ["an animal quacks rapidly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vh30P49Po6s", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["animal, quacks, rapidly", "music, gunfire, explosion"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a duck is quacking loudly", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be a movie", "label": 1}, {"captions": ["people speak and tapping occurs", "a vehicle is skidding and squealing tires"], "sample_ids": ["tFCUUGdREgA", "soTOh3zYJfY"], "start_seconds": ["70", "40"], "properties": ["people, tap, speak", "vehicle, skid, tires"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["ul60S8TXDA8", "yajyRTUQk3U"], "start_seconds": ["60", "400"], "properties": ["sound, distance, bell", "a woman, something, fried"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a woman is speaking while food is frying in the background"], "question": "which 
entity is about cooking?", "label": 1}, {"captions": ["people clap and speak in the distance", "water pouring and bubbling"], "sample_ids": ["wwyfGO2J4", "uyRfq-jKPpo"], "start_seconds": ["90", "50"], "properties": ["clap, distance, speak", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sShpyu2l4YQ", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["growl, bark, yip", "stream, water, flow"], "captions_pred_video": ["the puppies are playing with a toy", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["bees buzz as wind blows", "frogs croak and vocalize"], "sample_ids": ["tMJne1a4AFI", "yswmmRZFItk"], "start_seconds": ["0", "0"], "properties": ["bees, buzz, wind", "croak, vocalize, frog"], "captions_pred_video": ["a swarm of bees on the ground", "a close up of a frog in 
the water"], "captions_pred_audio": ["a swarm of bees buzzing around", "a frog is croaking"], "question": "which animal is more likely to be a frog", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["sWZzXuWYY", "xjvTpk2Zpr8"], "start_seconds": ["420", "70"], "properties": ["male, clanks, thumps", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["siJFXfGWgDk", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["a, bird, vehicle", "engine, idle, woman"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tDlfY3nmx1A", "yajyRTUQk3U"], "start_seconds": ["160", "400"], "properties": ["applause, laugh, man", "a woman, something, fried"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a woman is speaking while food is frying in the background"], "question": "which entity is about food?", "label": 1}, {"captions": ["a duck quacks several times", "a man speaks as a car is passing by"], 
"sample_ids": ["vh30P49Po6s", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["quacks, duck, several", "a, car, pass"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is moving", "label": 1}, {"captions": ["a man is filing a hard object", "a telephone rings and a bird vocalizes"], "sample_ids": ["vveS8HT7Uog", "skd2PphS6oI"], "start_seconds": ["100", "190"], "properties": ["a man, hard, object", "ring, bird, vocalize"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a telephone bell rings repeatedly "], "question": "which entity is not a bird?", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "vehicles pass by on a roadway"], "sample_ids": ["vddP56-ogds", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["liquid, laughs, man", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a car is driving on the road "], "question": "which entity shows vehicles moving", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", 
"a horn blasts loudly as a train passes"], "sample_ids": ["w2bYrCVLT60", "zsLxS-uLJTw"], "start_seconds": ["120", "20"], "properties": ["ducks, speak, quack", "horn, blast, train"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "footage of the train on the tracks at sunrise or sunset"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a train blows its horn and moves on the tracks "], "question": "which is louder", "label": 0}, {"captions": ["a motorcycle engine is idling", "people speak as gunfire rings out"], "sample_ids": ["vZAqdHZ81yA", "wqTCwqVRDlk"], "start_seconds": ["180", "80"], "properties": ["engine, motorcycle, idling", "gunfire, ring, speak"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an engine is idling loudly", "a man is speaking and a gun is fired"], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "water is sprayed across a hard surface"], "sample_ids": ["zcDwZ6W7E3E", "sQwlkXjQabo"], "start_seconds": ["180", "10"], "properties": ["a, man, speak", "water, spray, surface"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "spraying followed by silence"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a person whistles a meandering tune", "a car accelerates and wind blows"], "sample_ids": ["uFoga8sHpiw", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["person, tune, whistle", "accelerates, wind, blows"], "captions_pred_video": ["footage of a bird in a cage", null], "captions_pred_audio": ["a person whistles a 
song", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman and man are speaking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vbpKkWvfOu4", "vb1fPSDI4c"], "start_seconds": ["560", "30"], "properties": ["two people, speaking, woman, man", "multiple, people, yell"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["people speak then an engine runs", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uMTTDZ2mb4", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["engine, run, people", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a clock ticktocks"], "sample_ids": ["vGj1XLJvNrw", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["wails, wails, pass", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a police car driving down a city street", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["food is frying then a woman speaks", "water splashes and a door squeaks"], "sample_ids": ["ukxt9I7eMMg", 
"sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["food, woman, speak", "sound, splash, door"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a dog barks and taps with background noise "], "question": "which entity is silent", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["t97k0cejSQE", "tDlysoZiA1I"], "start_seconds": ["250", "0"], "properties": ["bird, chirp, insect", "animal, grunts, chirps"], "captions_pred_video": ["a bee on a purple thistle flower", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "birds are chirping and a rooster is crowing "], "question": "which entity has a bird chirp and an animal grunts?", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["u6jIvCtKarQ", "uZesmtKZGSw"], "start_seconds": ["70", "250"], "properties": ["a, man, speaks", "men, talk, cars"], "captions_pred_video": ["footage of a person using a blender on a stove top", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking and a car is revving with laughter in the background "], 
"question": "which entity has more cars", "label": 1}, {"captions": ["a cat meows and children speak", "a clock ticktocks"], "sample_ids": ["x5cuQjOdM3E", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["cat, speak, children", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a black background with an airplane flying in the sky", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a cat meows and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sEprKHm8Sj8", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["car, tires, slows", "female, spraying, scream"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaking with light rustling", "a man speaks over intermittent keyboard taps"], "sample_ids": ["zOZleIRqZm4", "tw76HGONaKg"], "start_seconds": ["80", "570"], "properties": ["light, rustling, man", "audio, man, keyboard"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of 
zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a machine runs continuously", "a machine engine runs and a man speaks"], "sample_ids": ["wdXV3Pv0jiY", "vs65y4qmyBE"], "start_seconds": ["11", "340"], "properties": ["machine, running, continuously", "engine, run, man"], "captions_pred_video": ["footage is blurry and shaky", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a heavy engine is running and men are speaking "], "question": "which machine is running continuously", "label": 0}, {"captions": ["an electronic device bleeps once", "people applaud and hoot and chat quietly"], "sample_ids": ["tHJ6JSa8Y4", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["bleeps, electronic, device", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a clock is ticking and beeping", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xOZfdgAgJ9o", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["woman, whimpering, speaking", "multiple, people, 
yell"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an airplane engine spools and people speak", "winds blows roughly as a vehicle races past"], "sample_ids": ["wTjoRj1se3U", "xjvTpk2Zpr8"], "start_seconds": ["390", "70"], "properties": ["airplane, engine, spool", "wind, blows, vehicle"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a jet engine is running and people are talking", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "an electric engine works nearby followed by a child talking"], "sample_ids": ["vK93VuO0yNc", "xSKJGCItUWE"], "start_seconds": ["30", "10"], "properties": ["male voice, bus, rumble", "engine, work, child"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a high pitched engine is running and a child speaks"], "question": "which entity is a machine", "label": 1}, {"captions": ["multiple ducks quack continuously", "a drill drills through something then people begin laughing"], "sample_ids": ["wfHeoPDLMaM", "tEE3MpBt1sg"], "start_seconds": ["30", "50"], "properties": ["multiple, quack, continuously", "drill, something, laugh"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a 
fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["ducks are quacking", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zj2R0XoFr5k", "wqZ135Ssz0"], "start_seconds": ["50", "60"], "properties": ["airplane, fly, overhead", "two men, woman, birds"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a bird?", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a toilet flushes and a female speaks"], "sample_ids": ["vGj1XLJvNrw", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["wails, wails, pass", "female, flushes, toilet"], "captions_pred_video": ["footage of a police car driving down a city street", "footage is blurry and out of focus"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a toilet flushes and a man speaks"], "question": "which entity is silent", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle is 
skidding and squealing tires"], "sample_ids": ["sfAvvZwdLCY", "soTOh3zYJfY"], "start_seconds": ["20", "40"], "properties": ["water drains, flushes, water", "vehicle, skid, tires"], "captions_pred_video": ["footage of the toilet in the bathroom", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a toilet is flushed", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a motorcycle engine works nearby", "winds blows roughly as a vehicle races past"], "sample_ids": ["tOSWIURC-4", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["engine, work, nearby", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a lawn mower is running ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["s3cTDAj31g", "tdWhHV3X25Q"], "start_seconds": ["80", "60"], "properties": ["man, talk, woman", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking and a crowd is clapping"], "question": "which entity has more people", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "paper folding and crinkling"], "sample_ids": ["ugHJF0hfYkg", "zPpG3RD8lSs"], "start_seconds": ["10", "20"], "properties": ["loud, intense, propeller", "paper, fold, crinkle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's 
day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a helicopter is flying overhead ", "the wind blows and a mouse clicks "], "question": "which is quieter", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a clock ticktocks briefly"], "sample_ids": ["zFjIWfSD-4", "u7C-AEBQM"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "ticktocks, clock, ticktocks briefly"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a ticktock of a clock"], "question": "which entity is silent", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a train horn blows as it passes by"], "sample_ids": ["wnpJndXuxLc", "zVacuqSb4LI"], "start_seconds": ["50", "30"], "properties": ["beeps, loud, whistle", "horn, blows, train"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a power tool runs and touches a surface", "birds chirp and objects are moved around"], "sample_ids": ["zfvPRf3chY", "yPUYU6t3rwo"], "start_seconds": ["290", "370"], "properties": ["power tool, run, touch", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "insects buzz and a man speaks"], "question": "which is not a power tool", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "paper is crumpling consistently"], "sample_ids": ["xSKJGCItUWE", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["engine, run, boy", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["some people speak", "birds chirp and objects are moved around"], "sample_ids": ["vbZ-0lGPneg", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of a man holding a baby duck 
in his hands", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a dog barks and whimpers", "sucking and grunting followed by slurping with birds in the background"], "sample_ids": ["sShpyu2l4YQ", "yYEVLuqEytU"], "start_seconds": ["0", "40"], "properties": ["barks, whimpers, dog", "grunt, slurp, background"], "captions_pred_video": ["the puppies are playing with a toy", "a baby goat is being petted by a person's hand"], "captions_pred_audio": ["a dog is barking and growling", "several sheep bleat and a man speaks"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "birds chirp and a dog breathes heavily"], "sample_ids": ["w2JXXIAdUdg", "y2ZBGpgbhHM"], "start_seconds": ["10", "30"], "properties": ["emits, sleeping, person", "dog, chirp, breathe"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "birds chirping and a dog panting"], "question": "which entity is a person", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "a man speaks as a motor runs in the background"], "sample_ids": ["sQwlkXjQabo", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "background, motor, run"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["spraying followed by silence", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "an airplane engine spools and people speak"], "sample_ids": ["yRx9txMcBl0", "wTjoRj1se3U"], 
"start_seconds": ["40", "390"], "properties": ["accelerates, tires, squeals", "airplane, engine, spool"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a jet engine is running and people are talking"], "question": "which entity is a vehicle", "label": 0}, {"captions": ["people speak as gunfire rings out", "water pouring and bubbling"], "sample_ids": ["wqTCwqVRDlk", "uyRfq-jKPpo"], "start_seconds": ["80", "50"], "properties": ["gunfire, ring, speak", "water, bubbles, pouring"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], 
"captions_pred_audio": ["a man is speaking and a gun is fired", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a car accelerates and wind blows"], "sample_ids": ["w2M4i1mklOA", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["alarm, gears, turn", "accelerates, wind, blows"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["water running down a sink while a man is talking", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vSeGhaZt-aI", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["water, sink, talk", "music, gunfire, explosion"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "birds chirp quietly and an adult man speaks"], "sample_ids": ["zF8yoL0rkbI", "zuua6-5goWw"], "start_seconds": ["30", "30"], "properties": ["engine, run, someone", "birds, chirp, quiet, man, speaks"], "captions_pred_video": ["footage of the traffic on the street at night", "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] 
screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "birds are chirping and a man is speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "water is sprayed across a hard surface"], "sample_ids": ["zY3icUyMdh8", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["dog, bark, engine", "water, spray, surface"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["y682ml90jGw", "zj2R0XoFr5k"], "start_seconds": ["11", "50"], "properties": ["beeps, series, electronic", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a beeping sound is being made ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sfAvvZwdLCY", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["flushes, drains, water", "People, motor, brakes"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while a car is driving and a ticking sound is heard "], 
"question": "which entity is a machine?", "label": 0}, {"captions": ["a person screams glaringly", "people applaud and hoot and chat quietly"], "sample_ids": ["xC8kbrKJmco", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["glaringly, screams, person", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a goat is bleating ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sQGXqGcwOTc", "vb1fPSDI4c"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "multiple, people, yell"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["weDbePuc-Xc", "yeFvk9x0wWI"], "start_seconds": ["40", "30"], "properties": ["music, slaps, human", "clack, bird, chirp"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "birds chirp in the background as a car drives by "], "question": "which entity has a human sniveling?", "label": 0}, {"captions": ["a stream of water flows quickly", "water flows and trickles"], "sample_ids": ["wbHTKEJZyhc", "tB7hWb9gTuQ"], "start_seconds": ["20", "30"], "properties": ["stream, water, flow", "water, flow, trickle"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the 
background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "water is splashing and gurgling"], "question": "which entity is flowing more slowly", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["v-wcQf4BDY0", "yajyRTUQk3U"], "start_seconds": ["120", "400"], "properties": ["bark, yip, sharply", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a dog barks and growls", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks and birds chirp in the distance", "a duck quacks and men speak and laugh"], 
"sample_ids": ["uGS7O46tlSo", "tiDFTC-5vU"], "start_seconds": ["70", "30"], "properties": ["a, distance, chirp", "quacks, speak, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a motorcycle engine is idling", "a machine beeps continuously"], "sample_ids": ["vZAqdHZ81yA", "y682ml90jGw"], "start_seconds": ["180", "11"], "properties": ["engine, motorcycle, idling", "beeps, machine, continuously"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a beeping sound is being made "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "an infant crying frantically"], "sample_ids": ["vJvryTwuAV8", "zwOBqeFTgiU"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "cry, infant, frantically"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a goat screams and people speak in the background", "continuous chugging with birds chirping in the background"], "sample_ids": ["xC8kbrKJmco", "xM4joTqDVp4"], "start_seconds": ["0", "160"], "properties": ["background, goat, scream", "background, chirp, birds"], "captions_pred_video": [null, "footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack"], "captions_pred_audio": ["a goat is bleating ", "birds are chirping and a train is moving "], "question": "which entity has a more natural 
background", "label": 1}, {"captions": ["some clanking with distant murmuring", "a stream of water runs briefly"], "sample_ids": ["uMTTDZ2mb4", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["clanking, murmuring, distant", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vSeGhaZt-aI", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "rooster, crow, background, men"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wSVhSdj0F0", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["horn honks, keys jingle, slam", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a woman speaks happily and an animal chirps"], "sample_ids": ["sSMl2vc3ek", "uWAAAL4CIoc"], "start_seconds": ["20", "0"], "properties": ["loud, multiple, 
distance", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a dog is barking "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": ["su6FAOcOA8c", "wnpJndXuxLc"], "start_seconds": ["4", "50"], "properties": ["engine, run, woman", "beeps, loud, whistle"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sncRqQ67iJU", "xBxDz0CFVn0"], "start_seconds": ["460", "30"], "properties": ["loud, repeatedly, man", "stream, water, flow"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a person is snoring", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a person is whistling", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sIXTftIuUgw", "xKB8O8LTs6s"], "start_seconds": ["90", "70"], "properties": ["person, whistling, person", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person whistling a song", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["a person is burping while a girl speaks", "an infant crying 
frantically"], "sample_ids": ["vdoxuJn9lTc", "zwOBqeFTgiU"], "start_seconds": ["40", "30"], "properties": ["person, burp, girl", "cry, infant, frantically"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of the baby crying in the car seat"], "captions_pred_audio": ["a child speaks followed by a burp", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["an engine starts and increases in power", "a person snores loudly multiple times at a close distance"], "sample_ids": ["zjTG0gaGCUI", "sSMl2vc3ek"], "start_seconds": ["80", "20"], "properties": ["power, increase, engine", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a jet engine roars as wind blows ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["someone snores nearby", "a man speaks as a car is passing by"], "sample_ids": ["spJCm8tD9Zo", "sK4u5T8hW78"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a, car, pass"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["paper folding and crinkling", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["zPpG3RD8lSs", "tDlfY3nmx1A"], "start_seconds": ["20", "160"], "properties": ["paper, 
fold, crinkle", "applause, laugh, man"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity is a crowd", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xyx6eNVEYRY", "w5W5Kqtc8E"], "start_seconds": ["380", "100"], "properties": ["loud, engine, muffles", "wind, blow, vehicle"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", null], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a infant makes noise and is excited"], "sample_ids": ["wvKpEYswXO0", "wIJK3-5y0kA"], "start_seconds": ["150", "30"], "properties": ["water, tap, run", "noise, excited, infant"], "captions_pred_video": ["of the person 
preparing food in the kitchen", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["slZLHwNbbt4", "vbZ-0lGPneg"], "start_seconds": ["300", "30"], "properties": ["a, horn, run", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a horn?", "label": 0}, {"captions": ["a child yells and another yells", "water is sprayed across a hard surface"], "sample_ids": ["vMDHu7Lxcgw", "sQwlkXjQabo"], "start_seconds": ["410", "10"], "properties": ["two, yell, child", "water, spray, surface"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a car accelerates and wind blows"], "sample_ids": ["yaln9y8I7ms", "u0TrcHhkPQ"], "start_seconds": ["230", "20"], "properties": ["female, flushes, toilet", "accelerates, wind, blows"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "people applaud and hoot and 
chat quietly"], "sample_ids": ["x5cuQjOdM3E", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["cat, talk, meow", "people, applaud, hoot"], "captions_pred_video": ["a black background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "water pouring and bubbling"], "sample_ids": ["zkKdxzNC97Y", "uyRfq-jKPpo"], "start_seconds": ["27", "50"], "properties": ["loud, bang, noise", "water, bubbles, pouring"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a door is opened and closed", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a man speaks followed by another man speaking outside"], "sample_ids": ["u6jIvCtKarQ", "viuTg1M-dqg"], "start_seconds": ["70", "30"], "properties": ["a, man, speaks", "two men, speak, follow"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage of water coming 
out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zuua6-5goWw", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["birds, chirp, quiet, man, speaks", "three men, wind, flow"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "water is sprayed across a hard surface"], "sample_ids": ["u5RmF3c3Aw", "sQwlkXjQabo"], "start_seconds": ["60", "10"], "properties": ["engine, car, zoom", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, 
and a man speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["wyllXV6PjKo", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["a baby, a woman, a man", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman speaks and a baby cries", "a jet engine roars and wind blows "], "question": "which entity is a person", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "wind blowing followed by a zoom"], "sample_ids": ["yZp6xizR0yU", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["animal, bleat, cry", "wind, blow, zoom"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "some clanking with distant murmuring"], "sample_ids": ["yYEVLuqEytU", "uMTTDZ2mb4"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "clanking, murmuring, distant"], "captions_pred_video": ["a baby goat is being petted by a person's hand", null], "captions_pred_audio": ["several sheep bleat and a man speaks", "people are talking and a car is driving by with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["children cheer as a man speaks then an audience screams", "a mechanical buzzing getting louder"], "sample_ids": ["vJvryTwuAV8", "sEprKHm8Sj8"], "start_seconds": ["16", "90"], "properties": ["audience, cheer, man", "noise, loud, buzzing"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 
2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a race car accelerates and revs its engine "], "question": "which entity is a noise", "label": 1}, {"captions": ["a helicopter engine runs continuously", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["ugHJF0hfYkg", "sLUnaPT5gM8"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "loud, laughter, intermittent"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a helicopter is flying overhead ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is intermittent", "label": 1}, {"captions": ["a duck quacks several times", "some tunes played by whistling"], "sample_ids": ["vh30P49Po6s", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["quacks, duck, several", "tune, play, whistling"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a duck is quacking loudly", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["an engine runs loudly", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vqZuVbG6-HI", "vb1fPSDI4c"], "start_seconds": ["130", "30"], "properties": ["loud, engine, run", "multiple, people, yell"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a small 
engine idles continuously", "people applaud and hoot and chat quietly"], "sample_ids": ["y5WII6cTH7k", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["engine, idle, continuously", "people, applaud, hoot"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", null], "captions_pred_audio": ["an engine is knocking and vibrating ", "people are clapping and speaking with background noise "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "wind blows and women speak as livestock vocalizes"], "sample_ids": ["y8WEcpOlT3I", "vXlk0lIQBFo"], "start_seconds": ["40", "470"], "properties": ["harsh, wind, blows", "wind, speak, vocalize"], "captions_pred_video": ["on how to use a sewing machine youtube", "- a woman and two donkeys in a fenced in area"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "wind chimes are ringing and people are speaking and laughing "], "question": "which entity has a harsher wind blowing", "label": 0}, {"captions": ["multiple ducks quack continuously", "vehicles pass by on a roadway"], "sample_ids": ["wfHeoPDLMaM", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["multiple, quack, continuously", "pass, vehicle, roadway"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the 
barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of a fire truck entering a garage"], "captions_pred_audio": ["ducks are quacking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a child speaks in closed space"], "sample_ids": ["wyllXV6PjKo", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["a baby, a woman, a man", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman speaks and a baby cries", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child?", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a woman speaks as frying food sizzles"], "sample_ids": ["ziUT9IFTkjg", "wTideSjRFS0"], "start_seconds": ["10", "30"], "properties": ["background, birds, rustling", "food, sizzle, woman"], "captions_pred_video": [null, "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman is speaking while water is running in the background"], "question": "which entity is about a woman speaking?", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "small dogs yip and bark sharply"], "sample_ids": ["t25U-v4k4ts", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["a, chirps, bird", "bark, yip, sharply"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a dog barks and growls"], "question": 
"which entity is more active", "label": 1}, {"captions": ["a man speaks as a machine runs", "people speak as gunfire rings out"], "sample_ids": ["vD6lYD1l0BY", "wqTCwqVRDlk"], "start_seconds": ["330", "80"], "properties": ["a, machine, run", "gunfire, ring, speak"], "captions_pred_video": ["game controller being held in the hands of the person", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["someone whistles a tune", "someone whistles a tune"], "sample_ids": ["sIXTftIuUgw", "sIXTftIuUgw"], "start_seconds": ["90", "90"], "properties": ["someone, tune, whistle", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a person whistling a song"], "question": "which entity is a person", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "some men converse over an engine running"], "sample_ids": ["vBslzh7saPw", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["engine, roar, louder", "men, converse, engine"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation over an engine running?", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xKB8O8LTs6s", "tDVADusiIoc"], "start_seconds": ["70", "60"], "properties": ["music, radio, gunshots", "water, radio, man"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["music plays while a woman speaks and 
gunshots are fired ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a woman speaking over a radio?", "label": 0}, {"captions": ["water flows followed by women screaming", "dishes cling together then a man begins to speak"], "sample_ids": ["w5W5Kqtc8E", "sQGXqGcwOTc"], "start_seconds": ["100", "3"], "properties": ["water, flow, women", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a man speaks as a car is passing by"], "sample_ids": ["vlJS7LN2XyM", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["background, clocks, ticking", "a, car, pass"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a dog barks and whimpers", "water splashing and a person laughs in the distance then a man speaks nearby"], "sample_ids": ["sShpyu2l4YQ", "vddP56-ogds"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "water, splash, 
person, laugh"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "water is running and gurgling and a man is speaking"], "question": "which entity is more active", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "a clock ticktocks"], "sample_ids": ["wnpJndXuxLc", "v-g-j2uTByM"], "start_seconds": ["50", "30"], "properties": ["beeps, loud, whistle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a clock is ticking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman sneezes then speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x4dZyf9Gbj0", "xKB8O8LTs6s"], "start_seconds": ["130", "70"], "properties": ["sneezes, speaks, woman", "music, gunfire, explosion"], "captions_pred_video": ["footage is blurry and out of focus", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman sneezes and speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a motorcycle engine works nearby", "pigeons vocalize and birds chirp"], "sample_ids": ["tOSWIURC-4", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["engine, work, nearby", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a vehicle is skidding and squealing tires", "a telephone rings followed by a woman talking"], "sample_ids": ["soTOh3zYJfY", "tGcFnX0GHI"], 
"start_seconds": ["40", "0"], "properties": ["vehicle, skid, tires", "ring, talk, woman"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "someone is snoring while sleeping"], "sample_ids": ["tw76HGONaKg", "ujMt0-D-x2k"], "start_seconds": ["570", "0"], "properties": ["music, click, man", "snore, sleep, someone"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "of the dog playing with a toy on the floor"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a person sniffs and sneezes", "an infant crying frantically"], "sample_ids": ["uRlbY6aoBU", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["sneezes, person, sniffs", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a man is 
sneezing ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "water flows and trickles"], "sample_ids": ["wz7N8YRy74I", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "water, flow, trickle"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a vehicle accelerates and squeals tires"], "sample_ids": ["weDbePuc-Xc", "yRx9txMcBl0"], "start_seconds": ["40", "40"], "properties": ["music, slaps, human", "accelerates, tires, squeals"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a door slams shut roughly", "an engine runs loudly"], "sample_ids": ["zkKdxzNC97Y", "vqZuVbG6-HI"], 
"start_seconds": ["27", "130"], "properties": ["a door, slams, shut", "loud, engine, run"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a door is opened and closed", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "a child speaks in closed space"], "sample_ids": ["vfYTJq7nU", "yW6FWLSLkx4"], "start_seconds": ["130", "40"], "properties": ["ducks, quack, man", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a dog barks and whimpers", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sShpyu2l4YQ", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["barks, whimpers, dog", "three men, wind, flow"], "captions_pred_video": ["the puppies are playing with a toy", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "a child speaks in closed space"], "sample_ids": ["ujMt0-D-x2k", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["snoring, rhythmical, nearby", "child, space, speak"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", 
"label": 1}, {"captions": ["water flows as men speak and yell", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vJ7JPEFhyLA", "zl9Dqx-j7q4"], "start_seconds": ["16", "6"], "properties": ["water, flow, men", "engine, laugh, loud"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a car accelerates and wind blows"], "sample_ids": ["ylpYOorfH4o", "u0TrcHhkPQ"], "start_seconds": ["410", "20"], "properties": ["engine, run, loud", "accelerates, wind, blows"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "wind blows as people chatter quietly"], "sample_ids": ["yYEVLuqEytU", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "wind, chatter, people"], "captions_pred_video": ["a baby goat is being petted by a 
person's hand", "footage is blurry and out of focus"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y2ZBGpgbhHM", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["animal, growl, bird", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vZAw4apG0Es", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["people, clock, converse", "multiple, people, yell"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["y2bVZ7rz-5M", "su6FAOcOA8c"], "start_seconds": ["280", "4"], "properties": ["engine, horn, siren", "engine, idle, woman"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a stream 
of water flows as people talk and wind blows", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xBxDz0CFVn0", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["stream, water, flow", "multiple, people, yell"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vBslzh7saPw", "w5W5Kqtc8E"], "start_seconds": ["90", "100"], "properties": ["power, scream, increase", "wind, blow, vehicle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a woman and man speak while food is frying"], "sample_ids": ["w2M4i1mklOA", "zk-xJGQU8-4"], "start_seconds": ["30", "130"], "properties": ["alarm, gears, turn", "food, man, woman"], "captions_pred_video": ["footage of an antique clock", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "some people speak"], "sample_ids": ["sZPuqDgX2V0", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["commentator, race, track", "some people speak English, some people speak Spanish, some people speak French"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": 
["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and a dog is whimpering"], "question": "which entity is not a race", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "dishes cling together then a man begins to speak"], "sample_ids": ["y2ZBGpgbhHM", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["birds, tweet, pant", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["birds chirping and a dog panting", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "race cars go around a track as a man commentates"], "sample_ids": ["xSKJGCItUWE", "uZesmtKZGSw"], "start_seconds": ["10", "250"], "properties": ["engine, work, child", "car, track, man"], "captions_pred_video": ["footage of the helicopter flying in the room", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has a man commentating?", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "bees buzz and wind blows"], "sample_ids": ["vVhthZ45k3Y", "tMJne1a4AFI"], "start_seconds": ["30", "0"], "properties": ["cat, purr, hiss", 
"bees buzz, wind blows, bees"], "captions_pred_video": ["footage is blurry and out of focus", "a swarm of bees on the ground"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a swarm of bees buzzing around"], "question": "which entity is more active", "label": 1}, {"captions": ["water runs into a sink while men speak", "water pouring and bubbling"], "sample_ids": ["vzceMbklWc", "uyRfq-jKPpo"], "start_seconds": ["180", "50"], "properties": ["water, sink, run", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["water is running and a man is speaking", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["someone is burping continuously", "a car accelerates and wind blows"], "sample_ids": ["y636gklDioE", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["burps, burps, burps", "accelerates, wind, blows"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and 
croak", "wind blowing followed by a zoom"], "sample_ids": ["s4tUs779vBA", "vr8ZXjEBhMQ"], "start_seconds": ["160", "150"], "properties": ["a, sound, stop", "wind, blow, zoom"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a car is revving and a man is speaking ", "wind blows and a chainsaw cuts through wood "], "question": "which entity has a zoom?", "label": 1}, {"captions": ["water splashes as an animal walks through", "an infant crying frantically"], "sample_ids": ["w1ir-sZ3Im8", "zwOBqeFTgiU"], "start_seconds": ["90", "30"], "properties": ["animal, water, splashes", "cry, infant, frantically"], "captions_pred_video": ["footage of a group of people riding horses through a river", "of the baby crying in the car seat"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a baby cries loudly"], "question": "which entity is a human", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "a man speaks as a car is passing by"], "sample_ids": ["vbpKkWvfOu4", "sK4u5T8hW78"], "start_seconds": ["560", "30"], "properties": ["a, woman, man", "a, car, pass"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking with 
background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xSKJGCItUWE", "sSMl2vc3ek"], "start_seconds": ["10", "20"], "properties": ["engine, work, child", "loud, multiple, distance"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["male speech with light ticking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xO-Q2BlIIPU", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["male, speech, ticking", "airplane, boy, fly"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a cat meows and children speak", "a person is burping while a girl speaks"], "sample_ids": ["x5cuQjOdM3E", "vdoxuJn9lTc"], "start_seconds": ["30", "40"], "properties": ["cat, speak, children", "person, burp, girl"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a cat meows and a woman speaks", "a child speaks followed by a burp"], "question": "which entity is speaking", "label": 1}, {"captions": ["a person is whistling", "wind loudly blowing while people speak in the background followed by a horn blowing"], "sample_ids": ["sIXTftIuUgw", "xjhAnI2q6hM"], 
"start_seconds": ["90", "6"], "properties": ["person, whistling, person", "wind, blow, loudly"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person whistling a song", "a truck is revving its engine and a man is speaking "], "question": "which entity is blowing", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a machine beeps continuously"], "sample_ids": ["sa6TLVbooCc", "y682ml90jGw"], "start_seconds": ["240", "11"], "properties": ["people, laugh, child", "beeps, machine, continuously"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", null], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "water pouring and bubbling"], "sample_ids": ["su6FAOcOA8c", "uyRfq-jKPpo"], "start_seconds": ["4", "50"], "properties": ["engine, run, woman", "water, bubbles, pouring"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking and a subway train is 
moving ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["y4tPJXBKDig", "su6FAOcOA8c"], "start_seconds": ["20", "4"], "properties": ["a, noise, talk", "engine, idle, woman"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a woman is speaking and a subway train is moving "], "question": "which entity is a person", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vh30P49Po6s", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["loud, continuous, quacks", "applause, audience, yells"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking and a crowd is clapping"], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone snores nearby", "a toilet flushes and water drains unevenly"], "sample_ids": ["spJCm8tD9Zo", "vhJWZheqaE"], "start_seconds": ["90", "0"], "properties": ["someone snores, nearby, someone", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a toilet is flushed"], "question": "which entity is a source of noise", "label": 0}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a train horn blows as it passes by"], "sample_ids": ["y8dSeubCNI", "zVacuqSb4LI"], "start_seconds": ["4", "30"], "properties": 
["men, women, car", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["an engine revving and people talking in the background", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["an animal quacks rapidly", "wind blows as people chatter quietly"], "sample_ids": ["vh30P49Po6s", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["animal, quacks, rapidly", "wind, chatter, people"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage is blurry and out of focus"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a clang followed by a toilet flushing", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wNZ5thZM7XU", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["sound, flush, toilet", "loud, multiple, distance"], "captions_pred_video": ["footage of a toilet in a bathroom stall", null], "captions_pred_audio": ["a toilet flushes", "a person snoring loudly"], "question": "which entity is louder", "label": 
1}, {"captions": ["a small engine spits as it runs", "people cheer as a vehicle engine revs"], "sample_ids": ["sZvwOuuPGP0", "xjhAnI2q6hM"], "start_seconds": ["50", "6"], "properties": ["spits, engine, runs", "engine revs, vehicle, people"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a medium engine is running ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["s4Uz1Ffgo04", "yDoT73BWsdA"], "start_seconds": ["100", "10"], "properties": ["water, rushes, motorcycle", "engine, revs, vehicle"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a man speaks as a motor runs in the background"], "sample_ids": ["yajyRTUQk3U", "xZepNM9qcRA"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "background, motor, run"], "captions_pred_video": ["- a woman cooking in the kitchen", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "water running down a sink while a man is talking"], "sample_ids": ["vKrYfzleLB8", "vSeGhaZt-aI"], "start_seconds": 
["110", "50"], "properties": ["a, ring, gunshots", "water, sink, talk"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of a man talking?", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a frog vocalizes while birds chirp"], "sample_ids": ["vXlk0lIQBFo", "vMf1dLD6Sng"], "start_seconds": ["470", "6"], "properties": ["wind, talk, vocalize", "frog, bird, vocalize"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a frog in a pond with pink flowers in the background"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a frog croaks loudly"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["a woman speaks followed by clicks and scraping", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yYJksgsxx5U", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["audio, clicks, scraping", "loud, multiple, distance"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", null], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a woman sneezes then speaks"], "sample_ids": ["xV7Mg1QucSc", "x4dZyf9Gbj0"], "start_seconds": ["14", "130"], "properties": ["alarm, ticktocks, laughs", "sneezes, speaks, woman"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage is blurry and out of focus"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a woman sneezes and speaks"], "question": "which entity is a 
woman?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a car speeding up in the distance"], "sample_ids": ["vb1fPSDI4c", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["multiple, people, yell", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a crowd of people are talking and laughing", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a muffled toilet flushes and the water drains"], "sample_ids": ["vqZuVbG6-HI", "sfAvvZwdLCY"], "start_seconds": ["130", "20"], "properties": ["background, male, female", "flushes, drains, water"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a toilet is flushed"], "question": "which entity is a toilet?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sK4u5T8hW78", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "a woman, a television program, a bird"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a dog is whimpering"], "question": "which entity has 
a bird nearby?", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["s7knHCFW82w", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["blow horn, get close, train", "music, gunfire, explosion"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "wind blows as people chatter quietly"], "sample_ids": ["tw76HGONaKg", "xBxDz0CFVn0"], "start_seconds": ["570", "30"], "properties": ["A, game, keyboard", "wind, chatter, people"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage is blurry and out of focus"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking with wind noise in the 
background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "a car accelerates and wind blows"], "sample_ids": ["zuua6-5goWw", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["birds, chirp, quiet, man, speaks", "accelerates, wind, blows"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaking with light rustling", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["zOZleIRqZm4", "xV7Mg1QucSc"], "start_seconds": ["80", "14"], "properties": ["light, rustling, man", "alarm, ticktocks, laughs"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "an alarm clock ticks and a woman laughs"], "question": "which entity is about a clock ticktocking?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "a horn blasts as warning bells ring"], "sample_ids": ["tgbONvsP47Y", "zgUgkpk78xU"], "start_seconds": ["0", "70"], "properties": ["pass, vehicle, roadway", "horn, bells, ring"], "captions_pred_video": ["footage of a fire truck 
entering a garage", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a car is driving on the road ", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a warning", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a child speaks in closed space"], "sample_ids": ["yRx9txMcBl0", "yW6FWLSLkx4"], "start_seconds": ["40", "40"], "properties": ["accelerates, tires, squeals", "child, space, speak"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["water flows as men speak and yell", "an airplane engine runs"], "sample_ids": ["vJ7JPEFhyLA", "yVPZ2MNWpms"], "start_seconds": ["16", "0"], "properties": ["water, flow, men", "engine, airplane, runs"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a 
man is speaking while the wind is blowing and water is splashing", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tw76HGONaKg", "wz7N8YRy74I"], "start_seconds": ["570", "30"], "properties": ["music, click, man", "rooster, crow, background, men"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more background noise", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["t25U-v4k4ts", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["bees buzz, birds chirp, man speaks", "a, scream, girl"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a person spraying paint on the ceiling"], 
"captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u7C-AEBQM", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["ticks, rhythmic, quiet", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a ticktock of a clock", "a woman speaks while a helicopter flies overhead "], "question": "which entity is moving", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zkKdxzNC97Y", "wqZ135Ssz0"], "start_seconds": ["27", "60"], "properties": ["hard, surface, door", "two men, woman, birds"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["x6ijhqRY38s", "tiDFTC-5vU"], "start_seconds": ["250", "30"], "properties": ["something metal, glass, hit", "male, duck, laugh"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vf9xf3vMsGM", 
"uEU-Hg5MTN8"], "start_seconds": ["540", "27"], "properties": ["A man speaks while turning a water faucet on.", "a woman, laughs, animal"], "captions_pred_video": ["of the person washing their hands under the faucet", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video of a person speaking?", "label": 0}, {"captions": ["people speak then an engine runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["uMTTDZ2mb4", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["engine, run, people", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["food is frying then a woman speaks", "a clock ticktocks"], "sample_ids": ["ukxt9I7eMMg", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "some men converse over an engine running"], "sample_ids": ["xjvTpk2Zpr8", "sCiy7QS1U"], "start_seconds": ["70", "300"], "properties": ["engine, run, wind", "men, converse, engine"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind 
blows ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a jet engine spools up and takes off", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["vBslzh7saPw", "uOpoD0gGXcs"], "start_seconds": ["90", "120"], "properties": ["engine, spools, takes", "chirps, woman, bird"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a herd of cows grazing in the field"], "captions_pred_audio": ["a jet engine roars and accelerates ", "birds are chirping and a man is speaking"], "question": "which entity is a response to a woman chirping?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "some men converse over an engine running"], "sample_ids": ["uYT5gxnyMWM", "sCiy7QS1U"], "start_seconds": ["50", "300"], "properties": ["person, spray, yell", "men, converse, engine"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a person speaking over spraying and another person yelling?", "label": 0}, {"captions": ["water splashes and a motorboat passes as people yell", "a large crowd cheers and applauds"], "sample_ids": ["w5W5Kqtc8E", "rqfQRErjfk8"], "start_seconds": ["100", "170"], "properties": ["water, splashes, motorboat", "crowd, cheers, applauds"], "captions_pred_video": [null, "a man hugging another man in front of an orchestra"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a crowd of people clapping and cheering"], "question": "which entity is more likely to be at a sporting event", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a man laughs and speaks as cats purr and hiss"], "sample_ids": ["u7C-AEBQM", "vVhthZ45k3Y"], "start_seconds": ["30", 
"30"], "properties": ["ticks, rhythmic, quiet", "cat, purr, hiss"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a cat is meowing"], "question": "which entity is more active", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "several insects fly while two men talk"], "sample_ids": ["uiItxDsDMFI", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["wood, piece, saw", "several, fly, men"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a saw is being used with background noise ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is not being sawed", "label": 1}, {"captions": ["birds vocalize and a man speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["v0wPrLBI3hg", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["vocalize, bird, speak", "two men, woman, birds"], "captions_pred_video": ["footage of the pigeons feeding on the ground", null], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a horn rings out as a machine runs by"], "sample_ids": ["rwTERCUno", "slZLHwNbbt4"], "start_seconds": ["90", "300"], "properties": ["engine, idle, sputter", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["an engine is idling and vibrating", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a person screams glaringly", "a man speaks over a 
radio as wind blows and water splashes"], "sample_ids": ["xC8kbrKJmco", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["glaringly, screams, person", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a infant makes noise and is excited", "water drips and bubbles as a man speaks"], "sample_ids": ["wIJK3-5y0kA", "vSeGhaZt-aI"], "start_seconds": ["30", "50"], "properties": ["noise, excited, infant", "water, bubbles, speak"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a liquid", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a man speaks over intermittent keyboard taps"], "sample_ids": ["sShpyu2l4YQ", "tw76HGONaKg"], "start_seconds": ["0", "570"], "properties": ["growl, bark, yip", "audio, man, keyboard"], "captions_pred_video": ["the puppies are playing with a toy", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time 
guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a dog is barking and growling", "a man speaks and types on a computer keyboard "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a woman speaks as she rubs two objects together"], "sample_ids": ["t97k0cejSQE", "vzxHnu-SFEw"], "start_seconds": ["250", "80"], "properties": ["bird, chirp, insect", "two objects, woman, speak"], "captions_pred_video": ["a bee on a purple thistle flower", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["an engine runs and wind blows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vs65y4qmyBE", "uYT5gxnyMWM"], "start_seconds": ["340", "50"], "properties": ["engine, run, wind", "female, spraying, scream"], "captions_pred_video": ["a car is engulfed in flames on the side of the 
road", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a beep occurs briefly", "a car accelerates and wind blows"], "sample_ids": ["xtWeJ56-U-g", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["beep, occur, briefly", "accelerates, wind, blows"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", null], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs as a siren and horn sound", "a stream of water runs briefly"], "sample_ids": ["u--KhUW8l1Y", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["sound, vehicle, horn", "stream, water, run"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "small dogs yip and bark sharply"], "sample_ids": ["slZLHwNbbt4", "v-wcQf4BDY0"], "start_seconds": ["300", "120"], "properties": ["clap, distance, horn", "bark, yip, sharply"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a train is moving and blowing its horn with a 
clickety-clack sound ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "an electric engine works nearby followed by a child talking"], "sample_ids": ["zj2R0XoFr5k", "xSKJGCItUWE"], "start_seconds": ["50", "10"], "properties": ["airplane, fly, woman", "engine, work, child"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a high pitched engine is running and a child speaks"], "question": "which entity is a machine", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "people cheer as a vehicle engine revs"], "sample_ids": ["wTjoRj1se3U", "xjhAnI2q6hM"], "start_seconds": ["390", "6"], "properties": ["engine, run, people", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a jet engine is running and people are talking", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["water flows and trickles", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["tB7hWb9gTuQ", "xV7Mg1QucSc"], "start_seconds": ["30", "14"], "properties": ["water, flow, trickle", "alarm, ticktocks, laughs"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["water is splashing and gurgling", "an alarm clock ticks and a woman laughs"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["men speak and a nozzle sprays liquid", "an airplane engine spools and people speak"], "sample_ids": 
["wRV8yMk886E", "wTjoRj1se3U"], "start_seconds": ["0", "390"], "properties": ["liquid, spray, nozzle", "airplane, engine, spool"], "captions_pred_video": ["two cars are parked in a parking lot at night", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "several insects fly while two men talk"], "sample_ids": ["sfAvvZwdLCY", "s-T9OVOiMLo"], "start_seconds": ["20", "330"], "properties": ["flushes, drains, water", "several, fly, men"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of a toilet flushing?", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["yZmhM1HcsyE", "ziUT9IFTkjg"], "start_seconds": ["4", "10"], "properties": ["engine, roar, water", "background, birds, rustling"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", null], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["s4Uz1Ffgo04", "wqZ135Ssz0"], "start_seconds": ["100", "60"], "properties": ["water, rushes, motorcycle", "two men, woman, birds"], "captions_pred_video": ["footage of an ambulance arriving at the 
scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "pigeons vocalize and birds chirp"], "sample_ids": ["y8WEcpOlT3I", "uiS58TNyUiw"], "start_seconds": ["40", "430"], "properties": ["harsh, wind, blows", "vocalize, bird, chirp"], "captions_pred_video": ["on how to use a sewing machine youtube", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["vf9xf3vMsGM", "wRBHTgrbiwg"], "start_seconds": ["540", "50"], "properties": ["A man speaks while turning a water faucet on.", "bird, owl, speak"], "captions_pred_video": ["of the person washing their hands under the faucet", "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a man is speaking while water is running in the background", "birds are chirping and insects are buzzing"], "question": "which entity has a man speaking to an owl?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "water bubbles and gurgles."], "sample_ids": ["vb1fPSDI4c", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "bubbles, gurgles, water"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a crowd of people are talking and laughing", "water is splashing and gurgling"], "question": "which entity has more bubbles", "label": 1}, {"captions": ["an infant crying and a woman speaking with 
some distant murmuring", "a stream of water runs briefly"], "sample_ids": ["smDKStoHBJo", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["a, infant, speaking", "stream, water, run"], "captions_pred_video": ["a man holding a crying baby in his arms", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yajyRTUQk3U", "uEU-Hg5MTN8"], "start_seconds": ["400", "27"], "properties": ["noise, woman, speak", "a woman, laughs, animal"], "captions_pred_video": ["- a woman cooking in the kitchen", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman is speaking and a baby is crying"], "question": "which woman is speaking over sizzling noise", "label": 0}, {"captions": ["a woman speaks with water running", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["wTideSjRFS0", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["water, running, woman", "beeps, hit, woman"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a beep 
sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "paper folding and crinkling"], "sample_ids": ["xO-Q2BlIIPU", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["two men, exclamation, speak", "paper, fold, crinkle"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "the wind blows and a mouse clicks "], "question": "which is not a person", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "an airplane engine runs"], "sample_ids": ["vBslzh7saPw", "yVPZ2MNWpms"], "start_seconds": ["90", "0"], "properties": ["engine, roar, louder", "engine, airplane, runs"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a car is driving by on the road "], "question": "which entity is a video of an airplane engine?", 
"label": 0}, {"captions": ["a goat bleats and someone makes a calling noise", "a infant makes noise and is excited"], "sample_ids": ["vlS6YMeWAPo", "wIJK3-5y0kA"], "start_seconds": ["40", "30"], "properties": ["noise, bleat, call", "noise, excited, infant"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a goat bleats and birds chirp", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "several insects fly while two men talk"], "sample_ids": ["xzKKf9bKNUo", "s-T9OVOiMLo"], "start_seconds": ["10", "330"], "properties": ["background, noise, snoring", "several, fly, men"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about insects?", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a stream of water runs briefly"], "sample_ids": ["uYT5gxnyMWM", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["a, scream, girl", "stream, water, run"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a propeller rotates loudly and intensely"], "sample_ids": ["siJFXfGWgDk", "ugHJF0hfYkg"], "start_seconds": ["50", "10"], "properties": ["a, bird, vehicle", "loud, intense, propeller"], "captions_pred_video": ["footage of a beekeeper working 
with bees in a beehive", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "an insect buzzes around continuously"], "sample_ids": ["wTideSjRFS0", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["food, sizzle, woman", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "an infant crying as a woman laughs"], "sample_ids": ["ylpYOorfH4o", "xhmRY9yhC7c"], "start_seconds": ["410", "20"], "properties": ["engine, run, loud", "a, laugh, infant"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", 
"water flows and trickles"], "sample_ids": ["ukxt9I7eMMg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["continuous, woman, speaking", "water, flow, trickle"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "water is splashing and gurgling"], "question": "which entity is a video of water flowing and trickling?", "label": 1}, {"captions": ["a cat meows and children speak", "plastic is tapped on while someone speaks"], "sample_ids": ["x5cuQjOdM3E", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["cat, speak, children", "plastic, tap, speak"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a man speaks as a motor runs in the background"], "sample_ids": ["xSKJGCItUWE", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["engine, work, child", "background, motor, run"], "captions_pred_video": ["footage of the helicopter flying in the room", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a drill runs and two people laugh", "a man speaks while rain falls onto a hard surface"], "sample_ids": ["tEE3MpBt1sg", "wqN6IIHw3po"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "rain, 
surface, fall"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "in your own words what is happening in this screenshot? blood splattered all over the place"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking and water is splashing"], "question": "which entity is a video of a drill running and two people laughing?", "label": 0}, {"captions": ["a man talks as several small engines run", "water flows and trickles"], "sample_ids": ["u9A6VZQCZpU", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xzKKf9bKNUo", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["background, noise, snoring", "rustling, ducks, quack"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", null], "captions_pred_audio": ["a person snoring loudly", "a duck quacks and a woman speaks"], "question": "which entity has more rustling", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["siJFXfGWgDk", "tDlysoZiA1I"], "start_seconds": ["50", "0"], "properties": ["a, bird, vehicle", "animal, grunts, chirps"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a man is speaking 
and birds are chirping in the background ", "birds are chirping and a rooster is crowing "], "question": "which entity has more birds", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "sucking and grunting followed by slurping with birds in the background"], "sample_ids": ["sShpyu2l4YQ", "yYEVLuqEytU"], "start_seconds": ["0", "40"], "properties": ["growl, bark, yip", "grunt, slurp, background"], "captions_pred_video": ["the puppies are playing with a toy", "a baby goat is being petted by a person's hand"], "captions_pred_audio": ["a dog is barking and growling", "several sheep bleat and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xjvTpk2Zpr8", "y8WEcpOlT3I"], "start_seconds": ["70", "40"], "properties": ["wind, blows, vehicle", "harsh, wind, blows"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vW4x7S1VfQc", "wDVMhEdTiVw"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "gun, shoot, water"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["food sizzles in a frying pan", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is about shooting something?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "several insects fly while two men talk"], "sample_ids": ["yYJksgsxx5U", "s-T9OVOiMLo"], 
"start_seconds": ["30", "330"], "properties": ["audio, woman, silverware", "several, fly, men"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a man woman speak while crickets sing", "paper is crumpling consistently"], "sample_ids": ["zTLVJCo4WEE", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["a, crickets, sing", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman speaks and crickets chirp", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "water flows and trickles"], "sample_ids": ["vf9xf3vMsGM", "tB7hWb9gTuQ"], "start_seconds": ["540", "30"], "properties": ["A man speaks while turning a water faucet on.", "water, flow, trickle"], "captions_pred_video": ["of the person washing their hands under the faucet", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking while water is running in the background", "water is splashing and gurgling"], "question": "which entity is a video of water flowing and trickling?", "label": 1}, {"captions": ["people speak and tapping occurs", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tFCUUGdREgA", "sSMl2vc3ek"], "start_seconds": ["70", "20"], "properties": ["people, tap, speak", "loud, multiple, distance"], "captions_pred_video": ["a person riding a white horse in an indoor arena", null], "captions_pred_audio": ["a man is speaking 
and walking with wind noise in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person sniffles and sneezes", "birds chirp and objects are moved around"], "sample_ids": ["uRlbY6aoBU", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["sneezes, sniffles, person", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is sneezing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sYITalLZjj4", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["stream, flow, wind", "music, gunfire, explosion"], "captions_pred_video": ["two ducks are swimming in the water near each other", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["wind blows and birds chirp", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["sofxkNWaP0s", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["wind, engine, louder", "sound, chirp, buzz"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "a bee on a purple thistle flower"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a bee buzzes and a woman speaks"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a stream of water flows quickly", "a car speeding up in the distance"], "sample_ids": ["wbHTKEJZyhc", "u0TrcHhkPQ"], "start_seconds": 
["20", "20"], "properties": ["stream, water, flow", "distance, car, speed"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["wind blows strongly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["w8uLijTqtlU", "xKB8O8LTs6s"], "start_seconds": ["70", "70"], "properties": ["wind, blows, strongly", "music, gunfire, explosion"], "captions_pred_video": ["footage is blurry and shaky", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["the wind is blowing strongly", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a 
door opens and birds chirp", "people cheer as a vehicle engine revs"], "sample_ids": ["yeFvk9x0wWI", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["door, open, birds", "engine revs, vehicle, people"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle?", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a motorcycle idles loudly as wind blows"], "sample_ids": ["tDVADusiIoc", "v7jJS8aAyA"], "start_seconds": ["60", "10"], "properties": ["wind, radio, waves", "wind, blows, loudly"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a motorcycle engine is idling and vibrating"], "question": "which entity is moving through the water", "label": 0}, {"captions": ["someone is snoring while sleeping", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["ujMt0-D-x2k", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["snore, sleep, someone", "music, gunfire, explosion"], "captions_pred_video": ["of the dog playing with a toy on the floor", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a person is snoring loudly", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["zkKdxzNC97Y", "rqu8iB22IY"], "start_seconds": ["27", "5"], "properties": ["loud, 
bang, noise", "sound, repeats, laugh"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a dog barks and a man speaks while music plays "], "question": "which entity has a loud bang followed by a softer banging noise?", "label": 0}, {"captions": ["a child babbles as a woman speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wEBlkGWVWwE", "vJ7JPEFhyLA"], "start_seconds": ["260", "16"], "properties": ["a, babble, woman", "three men, wind, flow"], "captions_pred_video": ["shows a person writing on the whiteboard", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "vehicles pass by on a roadway"], "sample_ids": ["spJCm8tD9Zo", "tgbONvsP47Y"], "start_seconds": ["90", "0"], "properties": ["snores, wheezes, sleeps", "pass, vehicle, roadway"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person is snoring loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "someone whistles a tune"], "sample_ids": ["sSMl2vc3ek", "sIXTftIuUgw"], "start_seconds": ["20", "90"], "properties": ["loud, multiple, distance", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["a person snores loudly multiple times at a close distance", "a car accelerates and wind blows"], "sample_ids": ["sSMl2vc3ek", 
"u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["loud, multiple, distance", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tjmoSi330GM", "zj2R0XoFr5k"], "start_seconds": ["23", "50"], "properties": ["speed, water, boat", "airplane, boy, fly"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "ticking continues without interruption"], "sample_ids": ["vVhthZ45k3Y", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["cat, purr, hiss", "ticking, continuous, clock"], "captions_pred_video": ["footage is blurry and out of focus", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a clock is ticking loudly"], "question": "which entity is a clock", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "an infant crying as a woman laughs"], "sample_ids": ["sWZzXuWYY", "xhmRY9yhC7c"], "start_seconds": ["420", "20"], "properties": ["male, clanks, thumps", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a 
toilet flushes and water drains"], "sample_ids": ["zFjIWfSD-4", "sfAvvZwdLCY"], "start_seconds": ["410", "20"], "properties": ["People, motor, brakes", "water drains, flushes, water"], "captions_pred_video": [null, "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a woman speaks happily and an animal chirps"], "sample_ids": ["s7knHCFW82w", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["blow horn, get close, train", "a woman, chirps, animal"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", null], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a woman is speaking and a dog is barking "], "question": "which entity is a person", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uPDn2BFTHk", "w34HjHr6gAY"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, 
{"captions": ["women speak as water runs briefly, children call out, and a man speaks", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["uRExseg-0XI", "y2bVZ7rz-5M"], "start_seconds": ["210", "280"], "properties": ["woman, man, water", "motor noise, horn, siren"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xMXvkIcaG0Y", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["sound, humming, rattling", "engine, revs, vehicle"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["an engine is revving and accelerating ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person speaks briefly", "paper folding and crinkling"], "sample_ids": ["zOZleIRqZm4", "zPpG3RD8lSs"], "start_seconds": ["80", "20"], "properties": ["person, talk, brief", "paper, fold, crinkle"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper 
youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "the wind blows and a mouse clicks "], "question": "which is not a person", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a rumbling clap in the distance followed by a horn and the rumbling grows louder"], "sample_ids": ["wP8ZKrlx3oA", "slZLHwNbbt4"], "start_seconds": ["40", "300"], "properties": ["fall, rain, splash", "clap, distance, horn"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is not a splash", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["xfudFO976zE", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["animal, bleats, cry", "male, duck, laugh"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["distant humming of an engine", "a stream of water runs briefly"], "sample_ids": ["yVPZ2MNWpms", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["sound, distance, engine", "stream, water, run"], "captions_pred_video": 
["footage of an airport with planes parked on the tarmac", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a car is driving by on the road ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man talks as several small engines run", "a telephone rings followed by a woman talking"], "sample_ids": ["u9A6VZQCZpU", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["a, man, talk", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["birds chirps while a siren signals in the distance", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uKCSGgof8gI", "vfYTJq7nU"], "start_seconds": ["12", "130"], "properties": ["chirps, distance, signal", "rustling, ducks, quack"], "captions_pred_video": ["footage of a street in a small town on a sunny day", null], "captions_pred_audio": ["a truck is accelerating and revving its engine ", "a duck quacks and a woman speaks"], "question": "which entity is about ducks?", "label": 1}, {"captions": ["a small engine idles continuously", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["y5WII6cTH7k", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["engine, idle, continuously", "loud, jet engine, roar"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an engine is knocking and vibrating ", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "water flows as men speak and yell"], 
"sample_ids": ["sOa7g-44Dag", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["audio, scratching, man", "water, flow, men"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["yZp6xizR0yU", "wyllXV6PjKo"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "a baby, a woman, a man"], "captions_pred_video": ["footage of a woman feeding goats in a barn", null], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a woman speaks and a baby cries"], "question": "which entity is a human", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a vehicle accelerates and squeals tires"], "sample_ids": ["xSKJGCItUWE", "yRx9txMcBl0"], "start_seconds": ["10", "40"], "properties": ["engine, run, boy", "accelerates, tires, squeals"], "captions_pred_video": ["footage of the helicopter flying in the room", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a high pitched engine is running and a child 
speaks", "a car is revving its engine and skidding "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sfAvvZwdLCY", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "male, duck, laugh"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["food is frying then a woman speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["ukxt9I7eMMg", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "rooster, crow, background, men"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in it?", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u6jIvCtKarQ", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["a, man, speaks", "airplane, boy, fly"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["wind blows strongly", "two men and a woman talk while wind blows and birds 
tweet"], "sample_ids": ["w8uLijTqtlU", "wqZ135Ssz0"], "start_seconds": ["70", "60"], "properties": ["wind, blows, strongly", "two men, woman, birds"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["x9JovgqUcs", "sLUnaPT5gM8"], "start_seconds": ["500", "0"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["young female child snoring and breathing deeply", "bees buzz and wind blows"], "sample_ids": ["sAam2NqGhLY", "tMJne1a4AFI"], "start_seconds": ["20", "0"], "properties": ["snoring, breathing, child", "bees buzz, wind blows, bees"], "captions_pred_video": ["of a little girl sleeping on a couch", "a swarm of bees on the ground"], "captions_pred_audio": ["a person is snoring", "a swarm of bees buzzing around"], "question": "which entity is moving", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "some men converse over an engine running"], "sample_ids": ["xyx6eNVEYRY", "sCiy7QS1U"], "start_seconds": ["380", "300"], "properties": ["loud, engine, muffles", "men, converse, engine"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", null], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man speaking?", 
"label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "a car speeding up in the distance"], "sample_ids": ["yDoT73BWsdA", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["engine revs, tires squeal, vehicle", "distance, car, speed"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving faster", "label": 1}, {"captions": ["a clock ticktocks in wind", "a clock ticktocks"], "sample_ids": ["yVumC9TGknc", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, wind", "ticktocks, clock, ticktocks"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a series of beeps and chirps", "a clock is ticking loudly"], "question": "which clock ticktocks", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wudZTNBtVqc", "y8WEcpOlT3I"], "start_seconds": ["60", "40"], "properties": ["accelerates, engine, wind", "harsh, wind, blows"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a vehicle engine accelerating and wind blowing?", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "roadway noise occurs and a truck accelerates"], "sample_ids": ["yZmhM1HcsyE", "tgbONvsP47Y"], "start_seconds": ["4", "0"], "properties": ["engine, roar, water", "noise, truck, accelerate"], "captions_pred_video": ["footage of a speedboat on a lake with water 
spraying from the back of the boat", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "people cheer as a vehicle engine revs"], "sample_ids": ["xMXvkIcaG0Y", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["sound, humming, rattling", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["an engine is revving and accelerating ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle engine revs?", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "pigeons vocalize and birds chirp"], "sample_ids": ["xjvTpk2Zpr8", "uiS58TNyUiw"], "start_seconds": ["70", "430"], "properties": ["engine, run, wind", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of the pigeon in the cage"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a woman speaks and is crumpling paper", "pigeons vocalize and birds chirp"], "sample_ids": ["xvDdE3zNf8Y", "uiS58TNyUiw"], "start_seconds": ["120", "430"], "properties": ["A, crumple, paper", "vocalize, bird, chirp"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of the pigeon in the cage"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a door opens 
and closes"], "sample_ids": ["zj2R0XoFr5k", "vBHyYJ8pL0"], "start_seconds": ["50", "2"], "properties": ["airplane, fly, woman", "open, close, door"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", null], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is stationary", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yajyRTUQk3U", "zj2R0XoFr5k"], "start_seconds": ["400", "50"], "properties": ["noise, woman, speak", "airplane, boy, fly"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "people cheer as a vehicle engine revs"], "sample_ids": ["vK93VuO0yNc", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["male voice, bus, rumble", "engine revs, vehicle, people"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man speaks as crickets sing", "multiple motorcycles pass by as a man speaks"], "sample_ids": ["ryFDPxgDOGc", "zcDwZ6W7E3E"], "start_seconds": ["570", "180"], "properties": ["a, crickets, sing", "man, speak, motorcycles"], 
"captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity has a man speaking as multiple motorcycles pass by?", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vBHyYJ8pL0", "vYkA3cfXp5Q"], "start_seconds": ["2", "30"], "properties": ["noise, door, opening", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "an engine is idling"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "someone is typing on a computer keyboard"], "sample_ids": ["ukxt9I7eMMg", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["continuous, woman, speaking", "keyboard, type, computer"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a person is typing on a keyboard"], "question": "which is a video", "label": 1}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "a car speeding up in the distance"], "sample_ids": ["wRBHTgrbiwg", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["birds, chirp, cooing", "distance, car, speed"], "captions_pred_video": ["of a bee pollinating the flowers in the field", null], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a race car accelerates and revs its 
engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "a woman speaks over sizzling noise"], "sample_ids": ["vf44CgrjT0A", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["loud, long, person", "noise, woman, speak"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a loud burp", "a woman is speaking while food is frying in the background"], "question": "which entity is quieter", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["xERFUeZONz8", "siJFXfGWgDk"], "start_seconds": ["0", "50"], "properties": ["ring, approach, traffic", "a, bird, vehicle"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["an emergency vehicle siren blares", "a man is speaking and birds are chirping in the background "], "question": "which entity has a vehicle passing nearby?", "label": 1}, {"captions": ["water pouring and bubbling", "dishes cling together then a man begins to speak"], "sample_ids": ["uyRfq-jKPpo", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["water, bubbles, pouring", "cling, speak, dishes"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["water is running from a faucet", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["people speak in a closed space", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["sTpirNYo8vQ", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["people, space, speak", "water, radio, man"], "captions_pred_video": ["of a man taking a selfie on a bus", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be in a closed space", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "wind blowing followed by a zoom"], "sample_ids": ["yaln9y8I7ms", "vr8ZXjEBhMQ"], "start_seconds": ["230", "150"], "properties": ["female, flushes, toilet", "wind, blow, zoom"], "captions_pred_video": ["footage is blurry and out of focus", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a toilet flushes and a man speaks", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a video of a toilet flushing?", "label": 0}, {"captions": ["scraping and female speech with distant music", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yHeVV-xeOxQ", "zl9Dqx-j7q4"], "start_seconds": ["130", "6"], "properties": ["female, speech, music", "engine, laugh, loud"], "captions_pred_video": ["of a girl milking a goat's udder", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and 
scraping sounds are heard in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "vehicles pass by on a roadway"], "sample_ids": ["xyx6eNVEYRY", "tgbONvsP47Y"], "start_seconds": ["380", "0"], "properties": ["loud, engine, muffles", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "footage of a fire truck entering a garage"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a car is driving on the road "], "question": "which vehicle is moving", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a car speeding up in the distance"], "sample_ids": ["uYT5gxnyMWM", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["a, scream, girl", "distance, car, speed"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a person is whistling", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sIXTftIuUgw", "vb1fPSDI4c"], "start_seconds": ["90", "30"], "properties": ["person, whistling, person", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person whistling a song", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "people applaud and hoot and chat quietly"], "sample_ids": ["sU53zg9Jp7s", "wwyfGO2J4"], "start_seconds": ["380", "90"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "people, applaud, hoot"], "captions_pred_video": ["a cartoon girl is standing in front of a blue 
couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a woman speaks as she rubs two objects together"], "sample_ids": ["yZmhM1HcsyE", "vzxHnu-SFEw"], "start_seconds": ["4", "80"], "properties": ["engine, roar, water", "two objects, woman, speak"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a machine?", "label": 0}, {"captions": ["a man speaks while water drains", "motors rev and run loudly as a person laughs"], "sample_ids": ["vSeGhaZt-aI", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["water, drain, man", "motors rev, laugh, loudly"], 
"captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a jet engine roars "], "question": "which entity is more quiet", "label": 0}, {"captions": ["people speak softly as food sizzles", "dishes cling together then a man begins to speak"], "sample_ids": ["yhQ2Lg-7qDY", "sQGXqGcwOTc"], "start_seconds": ["130", "3"], "properties": ["food, sizzle, speak", "cling, speak, dishes"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a faucet is running and a man is speaking", "mechanisms are operating and water is splashing "], "question": "which entity is about speaking?", "label": 0}, {"captions": ["a woman speaks as frying food sizzles", "a crowd yells, reacts and applauds"], "sample_ids": ["wTideSjRFS0", "wztCSUxOf8"], "start_seconds": ["30", "130"], "properties": ["food, sizzle, woman", "a crowd, yells, applauds"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "paper folding and crinkling"], "sample_ids": ["yDoT73BWsdA", "zPpG3RD8lSs"], "start_seconds": ["10", "20"], "properties": ["engine, revs, vehicle", "paper, fold, crinkle"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which is not a vehicle", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a clock ticktocks"], "sample_ids": ["wTideSjRFS0", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["food, sizzle, woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "a car accelerates and wind blows"], "sample_ids": ["tMJne1a4AFI", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["wind, buzz, rustling", "accelerates, wind, blows"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "children cheer as a man speaks then an audience screams"], "sample_ids": ["wqZ135Ssz0", "vJvryTwuAV8"], "start_seconds": ["60", "16"], "properties": ["two men, woman, birds", "audience, cheer, man"], 
"captions_pred_video": [null, "a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man is speaking and a crowd is shouting and whooping "], "question": "which entity has more people", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "an engine starts and increases in power"], "sample_ids": ["smDKStoHBJo", "zjTG0gaGCUI"], "start_seconds": ["0", "80"], "properties": ["a, talk, baby, cry", "power, increase, engine"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a jet engine roars as wind blows "], "question": "which entity is a moving object", "label": 1}, {"captions": ["paper folding and crinkling", "water flows and trickles"], "sample_ids": ["zPpG3RD8lSs", "tB7hWb9gTuQ"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "water, flow, trickle"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "the rocks on the beach are surrounded by water and the sky is visible in the 
background"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "water is splashing and gurgling"], "question": "which entity is more likely to flow", "label": 1}, {"captions": ["an emergency siren wails as it passes", "an infant crying as a woman laughs"], "sample_ids": ["vGj1XLJvNrw", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["wails, wails, pass", "a, laugh, infant"], "captions_pred_video": ["footage of a police car driving down a city street", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["un9VQlzgZM", "su6FAOcOA8c"], "start_seconds": ["5", "4"], "properties": ["wind, speak, laugh", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sYITalLZjj4", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["water, rushes, background, birds", "background, frog, croak"], "captions_pred_video": ["two ducks are swimming in the water near each other", "a close up of a frog in the water"], "captions_pred_audio": ["wind blows and birds chirp", "a frog is croaking"], "question": "which entity is a croaking animal?", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["voJh2gJxXhA", "w34HjHr6gAY"], 
"start_seconds": ["50", "30"], "properties": ["music, frog, croak", "beeps, hit, woman"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["music is playing and crickets are chirping ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "winds blows roughly as a vehicle races past"], "sample_ids": ["wDVMhEdTiVw", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["gun, shoot, water", "wind, blows, vehicle"], "captions_pred_video": ["a blurry image of trees and water in the forest", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man makes an exclamation, then another man speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["xO-Q2BlIIPU", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["two men, exclamation, speak", "beeps, hit, woman"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 
the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uiItxDsDMFI", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["wood, piece, saw", "a woman, laughs, animal"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a saw is being used with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "a telephone rings followed by a woman talking"], "sample_ids": ["x9JovgqUcs", "tGcFnX0GHI"], "start_seconds": ["500", "0"], "properties": ["a, man, speaks, keyboard", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a keyboard", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a motorcycle engine is idling", "small dogs yip and bark sharply"], "sample_ids": ["vZAqdHZ81yA", "v-wcQf4BDY0"], "start_seconds": ["180", "120"], "properties": ["engine, motorcycle, idling", "bark, yip, sharply"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["an engine is idling loudly", "a dog barks 
and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sdXV-ylviw", "vJ7JPEFhyLA"], "start_seconds": ["190", "16"], "properties": ["door, toilet, squeaks", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a still image?", "label": 0}, {"captions": ["multiple ducks quack continuously", "dishes cling together then a man begins to speak"], "sample_ids": ["wfHeoPDLMaM", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["multiple, quack, continuously", "cling, speak, dishes"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["ducks are quacking", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a man speaks as a boat engine runs", 
"people speak in the background as a clock ticktocks"], "sample_ids": ["wtDqrBygTcU", "vZAw4apG0Es"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "background, clock, ticktocks"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a man is speaking and a motor is running", "a clock is ticking and people are talking"], "question": "which entity has a clock ticking in the background?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "an airplane engine spools and people speak"], "sample_ids": ["uC9dtII1KDI", "wTjoRj1se3U"], "start_seconds": ["150", "390"], "properties": ["wind, gusts, distance", "airplane, engine, spool"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a clock ticktocks continuously", "a man speaks over a running engine and blowing wind"], "sample_ids": ["vlJS7LN2XyM", "ylpYOorfH4o"], "start_seconds": ["30", "410"], "properties": ["ticktocks, clock, ticktocks continuously", "engine, running, wind"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 
youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and an engine is revving"], "question": "which entity is not a clock?", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wP8ZKrlx3oA", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["fall, rain, splash", "wind, blow, vehicle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", null], "captions_pred_audio": ["a heavy rain is falling on a surface", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a man makes an exclamation, then another man speaks", "a man speaks as a car is passing by"], "sample_ids": ["xO-Q2BlIIPU", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["two men, exclamation, speak", "a, car, pass"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["there are rhythmical snoring nearby", 
"an insect buzzes around continuously"], "sample_ids": ["ujMt0-D-x2k", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["snoring, rhythmical, nearby", "buzzes, continuously, insect"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person is snoring loudly", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["rustling with distant murmuring", "an engine runs loudly"], "sample_ids": ["wnNNcxAPwGQ", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["sound, distance, rustling", "loud, engine, run"], "captions_pred_video": ["footage of a yellow truck doing a burnout on a race track", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a crowd of people are talking and laughing while a skateboard rolls by ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp and objects are moved around", "roadway noise occurs and a truck accelerates"], "sample_ids": ["yPUYU6t3rwo", "tgbONvsP47Y"], "start_seconds": ["370", "0"], "properties": ["birds chirp, objects are moved around, birds", "noise, truck, accelerate"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage of a fire truck entering a garage"], "captions_pred_audio": ["insects buzz and a man speaks", "a car is driving on the road "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a weapon fires multiple times", "wind blowing followed by a zoom"], "sample_ids": ["sMC07Ucy7kg", "vr8ZXjEBhMQ"], "start_seconds": ["10", "150"], "properties": ["weapon, fire, multiple", "wind, blow, zoom"], "captions_pred_video": ["footage is from a car's point of view", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["people are clapping and speaking with 
background noise ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a duck quacks several times", "a woman speaks as she rubs two objects together"], "sample_ids": ["vh30P49Po6s", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["quacks, duck, several", "two objects, woman, speak"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is speaking", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "people cheer as a vehicle engine revs"], "sample_ids": ["wAAkbZToh8", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["burp, laugh, speak", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man burps and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": 
"which entity is about a vehicle?", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "a clock ticktocks"], "sample_ids": ["wudZTNBtVqc", "v-g-j2uTByM"], "start_seconds": ["60", "30"], "properties": ["accelerates, engine, wind", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["frogs croak and vocalize", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yswmmRZFItk", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["croak, vocalize, frog", "stream, water, flow"], "captions_pred_video": ["a close up of a frog in the water", "footage is blurry and out of focus"], "captions_pred_audio": ["a frog is croaking", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vlS6YMeWAPo", "tiDFTC-5vU"], "start_seconds": ["40", "30"], "properties": ["noise, bleat, call", "male, duck, laugh"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["an aircraft engine runs", "a power tool runs and touches a surface"], "sample_ids": ["yLCORCnd35Q", "zfvPRf3chY"], "start_seconds": ["0", "290"], "properties": ["engine, aircraft, runs", "power tool, run, touch"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", null], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a man is speaking while a power tool is 
being used "], "question": "which entity is a machine", "label": 1}, {"captions": ["a clock ticktocks continuously", "small dogs yip and bark sharply"], "sample_ids": ["vlJS7LN2XyM", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["ticktocks, clock, ticktocks continuously", "bark, yip, sharply"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a ticktock of a clock", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uYT5gxnyMWM", "wDVMhEdTiVw"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "gun, shoot, water"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["an airplane engine runs", "an infant crying as a woman laughs"], "sample_ids": ["yVPZ2MNWpms", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["engine, airplane, runs", "a, laugh, infant"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a car is driving by on the road ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zsLxS-uLJTw", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": 
["horn, blast, train", "a woman, something, fried"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of a woman talking?", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a whistling owl calls out repeatedly and insects screech"], "sample_ids": ["yajyRTUQk3U", "w6RTHR6AeAg"], "start_seconds": ["400", "40"], "properties": ["a woman, something, fried", "call, owl, screech"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "an owl hoots and mechanisms operate "], "question": "which entity is a bird?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a man speaks as a car is passing by"], "sample_ids": ["vs65y4qmyBE", "sK4u5T8hW78"], "start_seconds": ["340", "30"], "properties": ["engine, run, man", "a, car, pass"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking while a machine engine runs?", "label": 0}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "an infant crying 
frantically"], "sample_ids": ["zY3icUyMdh8", "zwOBqeFTgiU"], "start_seconds": ["20", "30"], "properties": ["dog, bark, engine", "cry, infant, frantically"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of the baby crying in the car seat"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a stream of water runs briefly"], "sample_ids": ["yZmhM1HcsyE", "x-PeY8Yb8M4"], "start_seconds": ["4", "300"], "properties": ["engine, roar, water", "stream, water, run"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "frogs croak and vocalize"], "sample_ids": ["sU53zg9Jp7s", "yswmmRZFItk"], "start_seconds": ["380", "0"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "croak, vocalize, frog"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a close up of a frog in the water"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a frog is croaking"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yVumC9TGknc", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["humming, clock, birds", "water, radio, man"], "captions_pred_video": ["game title screen of the game shadow of the 
colossus on sony playstation 2", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a clock?", "label": 0}, {"captions": ["a man talks while a clock does ticktock", "a child speaks in closed space"], "sample_ids": ["spYNpeN7rPY", "yW6FWLSLkx4"], "start_seconds": ["1", "40"], "properties": ["a clock, ticktock, man", "child, space, speak"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["humming and rattling of an engine idling as it revs", "a stream of water runs briefly"], "sample_ids": ["xMXvkIcaG0Y", "x-PeY8Yb8M4"], 
"start_seconds": ["30", "300"], "properties": ["sound, humming, rattling", "stream, water, run"], "captions_pred_video": ["footage of a car's hood being opened up to reveal the engine underneath the hood", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["an engine is revving and accelerating ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a child speaks in closed space"], "sample_ids": ["wDVMhEdTiVw", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["gun, shoot, water", "child, space, speak"], "captions_pred_video": ["a blurry image of trees and water in the forest", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "paper folding and crinkling"], "sample_ids": ["xvDdE3zNf8Y", "zPpG3RD8lSs"], "start_seconds": ["120", "20"], "properties": ["A, crumple, paper", "paper, fold, crinkle"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a 
piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman speaks and crumples paper", "the wind blows and a mouse clicks "], "question": "which entity is crumpling paper", "label": 0}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wjsXBsc7M40", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "wind, blow, vehicle"], "captions_pred_video": ["footage of the baby playing with a toothbrush", null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a baby laughing?", "label": 0}, {"captions": ["a door opens and closes", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vBHyYJ8pL0", "y8WEcpOlT3I"], "start_seconds": ["2", "40"], "properties": ["open, close, door", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is speaking with wind noise in the background "], "question": "which entity is more likely to be a door", "label": 0}, {"captions": ["a machine clanks and thumps and a male speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["sWZzXuWYY", "xZepNM9qcRA"], "start_seconds": ["420", "30"], "properties": ["male, clanks, thumps", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, 
{"captions": ["a dog barks and whimpers", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sShpyu2l4YQ", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["barks, whimpers, dog", "a woman, something, fried"], "captions_pred_video": ["the puppies are playing with a toy", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a car speeding up in the distance"], "sample_ids": ["vbZ-0lGPneg", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["a woman, a television program, a bird", "distance, car, speed"], "captions_pred_video": ["of a man holding a baby duck in his hands", null], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["xfudFO976zE", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["animal, bleats, cry", "a woman, laughs, animal"], "captions_pred_video": ["footage is blurry and shaky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "a man speaks while turning a water faucet on"], "sample_ids": ["vZAw4apG0Es", "vf9xf3vMsGM"], "start_seconds": ["30", "540"], "properties": ["background, clock, ticktocks", "A man speaks while turning a water faucet on."], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", 
"of the person washing their hands under the faucet"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking while water is running in the background"], "question": "which entity is more active", "label": 1}, {"captions": ["birds vocalize and a man speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["v0wPrLBI3hg", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["vocalize, bird, speak", "multiple, people, yell"], "captions_pred_video": ["footage of the pigeons feeding on the ground", null], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["yajyRTUQk3U", "uZesmtKZGSw"], "start_seconds": ["400", "250"], "properties": ["a woman, something, fried", "men, talk, cars"], "captions_pred_video": ["- a woman cooking in the kitchen", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["a machine runs continuously", "vehicle engines race around a track as a man commentates"], "sample_ids": 
["wdXV3Pv0jiY", "sZPuqDgX2V0"], "start_seconds": ["11", "30"], "properties": ["machine, running, continuously", "commentator, race, track"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a man is speaking and a helicopter is flying overhead "], "question": "which machine is running continuously", "label": 0}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "a man speaks as a motor runs in the background"], "sample_ids": ["smDKStoHBJo", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["a, infant, speaking", "background, motor, run"], "captions_pred_video": ["a man holding a crying baby in his arms", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["siJFXfGWgDk", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["man, woman, vehicle", "cling, speak, dishes"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "mechanisms are operating and water is splashing "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a person uses a saw to cut some wood", "someone snores nearby"], "sample_ids": ["sHbXC6na9hg", "spJCm8tD9Zo"], "start_seconds": ["0", "90"], "properties": ["a person, saw, wood", "someone snores, nearby, someone"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "of a man laying on the ground with his mouth open"], 
"captions_pred_audio": ["an engine is idling and vibrating", "a person is snoring loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["a man is snoring loudly and repeatedly", "wind blowing followed by a zoom"], "sample_ids": ["sncRqQ67iJU", "vr8ZXjEBhMQ"], "start_seconds": ["460", "150"], "properties": ["loud, repeatedly, man", "wind, blow, zoom"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a person is snoring", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["s4Uz1Ffgo04", "zj2R0XoFr5k"], "start_seconds": ["100", "50"], "properties": ["water, rushes, motorcycle", "airplane, boy, fly"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a car accelerates and wind blows"], "sample_ids": ["xjhAnI2q6hM", "u0TrcHhkPQ"], "start_seconds": ["6", "20"], "properties": ["engine revs, vehicle, people", "accelerates, wind, blows"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", null], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a door opens and closes", "waves crash against a shoreline and people speak"], "sample_ids": ["vBHyYJ8pL0", 
"yFB25fqfU8I"], "start_seconds": ["2", "300"], "properties": ["open, close, door", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be a door", "label": 0}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a woman speaks as she rubs two objects together"], "sample_ids": ["tQWGZLItBXk", "vzxHnu-SFEw"], "start_seconds": ["170", "80"], "properties": ["voice, music, whoosh", "two objects, woman, speak"], "captions_pred_video": ["worms revolution screenshots", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a vehicle engine 
accelerating then running on idle"], "sample_ids": ["vSeGhaZt-aI", "vYkA3cfXp5Q"], "start_seconds": ["50", "30"], "properties": ["water, sink, talk", "engine, accelerate, idle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "an engine is idling"], "question": "which is a vehicle", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "a stream of water runs briefly"], "sample_ids": ["yDoT73BWsdA", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["engine revs, tires squeal, vehicle", "stream, water, run"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["y2bVZ7rz-5M", "wDVMhEdTiVw"], "start_seconds": ["280", "30"], "properties": ["engine, horn, siren", "gun, shoot, water"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a gun", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a toilet flushes and a female speaks"], "sample_ids": ["uZesmtKZGSw", "yaln9y8I7ms"], "start_seconds": ["250", "230"], "properties": ["car, track, man", "female, flushes, toilet"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["some clanking with distant murmuring", "a propeller rotates loudly and intensely"], "sample_ids": ["uMTTDZ2mb4", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["clanking, murmuring, distant", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a clock ticktocks"], "sample_ids": ["vJ7JPEFhyLA", "v-g-j2uTByM"], "start_seconds": ["16", "30"], "properties": ["three men, wind, flow", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as a machine runs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vD6lYD1l0BY", "vb1fPSDI4c"], "start_seconds": ["330", "30"], "properties": ["a, machine, 
run", "multiple, people, yell"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a cat meows as a young woman speaks"], "sample_ids": ["wTjoRj1se3U", "x5cuQjOdM3E"], "start_seconds": ["390", "30"], "properties": ["engine, run, people", "cat, meows, young woman"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a jet engine is running and people are talking", "a cat meows and a woman speaks"], "question": "which entity is more likely to be a pet", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a car speeding up in the distance"], "sample_ids": ["vVhthZ45k3Y", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["cat, purr, hiss", "distance, car, speed"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a person is whistling a tune", "someone is typing on a computer keyboard"], "sample_ids": ["scYRUkrFLiQ", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["a, tune, whistle", "keyboard, type, computer"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "how to make money on youtube in spanish"], "captions_pred_audio": ["a person whistling a song", "a person is typing on a keyboard"], "question": "which is not a musical instrument", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "two women and a man talk while a kid cries"], "sample_ids": 
["sHbXC6na9hg", "wyllXV6PjKo"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "a kid, talk, cry"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", null], "captions_pred_audio": ["an engine is idling and vibrating", "a woman speaks and a baby cries"], "question": "which entity is about a person cutting wood?", "label": 0}, {"captions": ["a motor idles, accelerates, then slows down.", "paper is crumpling consistently"], "sample_ids": ["vYkA3cfXp5Q", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["speed, idle, accelerate", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["an engine is idling", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "vehicles pass by on a roadway"], "sample_ids": ["vSeGhaZt-aI", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, run", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a door opens and birds chirp", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["yeFvk9x0wWI", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["door, open, birds", "music, gunfire, explosion"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "music plays while a woman speaks and 
gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "a woman speaks as she rubs two objects together"], "sample_ids": ["sapQIQUhFc", "vzxHnu-SFEw"], "start_seconds": ["280", "80"], "properties": ["liquid, flow, distance", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is more likely to be a video of a woman speaking?", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a car speeding up in the distance"], "sample_ids": ["ugHJF0hfYkg", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["engine, running, continuously", "distance, car, speed"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": 
"which entity is moving faster", "label": 1}, {"captions": ["a duck quacks continuously", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vh30P49Po6s", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["quacks, continuously, duck", "a woman, something, fried"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a goat bleats as a person speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tPJvjq9QePY", "uZesmtKZGSw"], "start_seconds": ["40", "250"], "properties": ["bleats, person, speak", "men, talk, cars"], "captions_pred_video": ["a dog and a sheep in a barn", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a child and woman laughs and the woman speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["uPDn2BFTHk", "wwyfGO2J4"], "start_seconds": ["140", "90"], "properties": ["woman, laughs, speaks", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", 
"people are clapping and speaking with background noise "], "question": "which entity shows more people", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["wqN6IIHw3po", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["rain, surface, fall", "two men, woman, birds"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", null], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wP8ZKrlx3oA", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["heavy, rain, fall", "female, spraying, scream"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman and man are speaking", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["vbpKkWvfOu4", "zO-LSSY92ZM"], "start_seconds": ["560", "30"], "properties": ["two people, speaking, woman, man", "liquid, surface, sound"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to 
clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "steam is hissing and hissing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a rumble grows louder", "a man speaks as a car is passing by"], "sample_ids": ["y4MY9mp8-TA", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["loudness, increase, rumble", "a, car, pass"], "captions_pred_video": ["a helicopter flying in the sky", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a helicopter flies overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a rumble", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "some men converse over an engine running"], "sample_ids": ["yRx9txMcBl0", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["accelerates, tires, 
squeals", "men, converse, engine"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", null], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a diesel truck engine runs continuously", "a man speaks as a car is passing by"], "sample_ids": ["sZvwOuuPGP0", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "a, car, pass"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a vehicle", "label": 1}, {"captions": ["children speak and play together", "a telephone rings followed by a woman talking"], "sample_ids": ["yVVP8XvWJTo", "tGcFnX0GHI"], "start_seconds": ["260", "0"], "properties": ["children, speak, play", "ring, talk, woman"], 
"captions_pred_video": ["footage of a playground at a school or daycare center", null], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["paper is crumpling consistently", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["v5cSxLaHADY", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "rustling, ducks, quack"], "captions_pred_video": ["footage of the person holding a pair of scissors", null], "captions_pred_audio": ["paper is crumpled and crinkled", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uqFtmnhuqA8", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "beeps, hit, woman"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a person speaks briefly", "some men talk 
among st themselves as cars speed and race loudly"], "sample_ids": ["zOZleIRqZm4", "uZesmtKZGSw"], "start_seconds": ["80", "250"], "properties": ["person, talk, brief", "men, talk, cars"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["someone is snoring while sleeping", "a helicopter engine idles continuously"], "sample_ids": ["ujMt0-D-x2k", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["snore, sleep, someone", "engine, idle, continuously"], "captions_pred_video": ["of the dog playing with a toy on the floor", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person is snoring loudly", "a helicopter is flying overhead "], "question": "which entity is not a person", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vbr9mHKc8WM", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["noise, loudness, engine", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a baby 
is crying"], "question": "which entity is silent", "label": 0}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "two men speak as a buffeting wind blows"], "sample_ids": ["x5cuQjOdM3E", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["cat, talk, meow", "wind, speak, buffeting"], "captions_pred_video": ["a black background with an airplane flying in the sky", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["slZLHwNbbt4", "uYT5gxnyMWM"], "start_seconds": ["300", "50"], "properties": ["clap, distance, horn", "female, spraying, scream"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a motorcycle engine is idling", "a man speaks as a motor runs in the background"], "sample_ids": ["vZAqdHZ81yA", "xZepNM9qcRA"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "background, motor, run"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an engine is idling loudly", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a duck quacks several times", "some men talk among st themselves as cars speed and race 
loudly"], "sample_ids": ["vh30P49Po6s", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["quacks, duck, several", "men, talk, cars"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a duck is quacking loudly", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man talks while a clock does ticktock", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["spYNpeN7rPY", "y8WEcpOlT3I"], "start_seconds": ["1", "40"], "properties": ["a clock, ticktock, man", "harsh, wind, blows"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["a woman speaks over sizzling noise", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["yajyRTUQk3U", "tDlysoZiA1I"], "start_seconds": ["400", "0"], "properties": ["noise, woman, speak", "animal, grunts, chirps"], "captions_pred_video": ["- a woman cooking in the kitchen", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["w34HjHr6gAY", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["beeps, hit, woman", "loud, 
multiple, distance"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "a man speaks while water trickles and flows"], "sample_ids": ["wqZ135Ssz0", "sapQIQUhFc"], "start_seconds": ["60", "280"], "properties": ["two men, woman, birds", "water, trickles, flow"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a man speaks as a motor runs in the background"], "sample_ids": ["wy1eKjR7KC0", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "background, motor, run"], "captions_pred_video": ["two police officers riding motorcycles down the street", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "people applaud and hoot and chat quietly"], "sample_ids": 
["sQwlkXjQabo", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["liquid, surface, spray", "people, applaud, hoot"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["x5cuQjOdM3E", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["cat, meows, young woman", "a woman, something, fried"], "captions_pred_video": ["a black background with an airplane flying in the sky", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is about a cat?", "label": 0}, {"captions": ["birds chirp and objects are moved around", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yPUYU6t3rwo", "uEU-Hg5MTN8"], "start_seconds": ["370", "27"], "properties": ["birds chirp, objects are moved around, birds", "a woman, laughs, animal"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["insects buzz and a man speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a video of a person speaking and laughing?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a loud engine muffles a man as he speaks"], "sample_ids": ["vZAw4apG0Es", "xyx6eNVEYRY"], "start_seconds": ["30", "380"], "properties": ["background, tick, repeat", "loud, engine, muffles"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a helicopter landing 
on a runway at an airport"], "captions_pred_audio": ["a clock is ticking and people are talking", "an aircraft engine is running and a man is speaking "], "question": "which entity is muffled", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "after a few seconds of silence, a loud bang occurs followed by a softer banging noise"], "sample_ids": ["weDbePuc-Xc", "zkKdxzNC97Y"], "start_seconds": ["40", "27"], "properties": ["cartoon character, music, vocalize", "loud, bang, noise"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a door is opened and closed"], "question": "which entity is more quiet", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "paper folding and crinkling"], "sample_ids": ["wP8ZKrlx3oA", "zPpG3RD8lSs"], "start_seconds": ["40", "20"], "properties": ["rain, storm, thunder", "paper, fold, crinkle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a 
piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a heavy rain is falling on a surface", "the wind blows and a mouse clicks "], "question": "which entity is not a natural phenomenon", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "water pouring and bubbling"], "sample_ids": ["spYNpeN7rPY", "uyRfq-jKPpo"], "start_seconds": ["1", "50"], "properties": ["a clock, ticktock, man", "water, bubbles, pouring"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how 
to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["xERFUeZONz8", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["ring, approach, traffic", "music, gunfire, explosion"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an emergency vehicle siren blares", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["a goat bleats as a person speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tPJvjq9QePY", "vb1fPSDI4c"], "start_seconds": ["40", "30"], "properties": ["bleats, person, speak", "multiple, people, yell"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "small dogs yip and bark sharply"], "sample_ids": ["xV7Mg1QucSc", "v-wcQf4BDY0"], "start_seconds": ["14", "120"], "properties": ["alarm, ticktocks, laughs", "bark, yip, sharply"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": 
["an alarm clock ticks and a woman laughs", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a duck quacks continuously", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vh30P49Po6s", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["quacks, continuously, duck", "loud, jet engine, roar"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a duck is quacking loudly", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a power tool runs and touches a surface", "a horn rings out as a machine runs by"], "sample_ids": ["zfvPRf3chY", "slZLHwNbbt4"], "start_seconds": ["290", "300"], "properties": ["power tool, run, touch", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man yells and speaks as water splashes", "a stream of water flows as people talk and wind blows"], "sample_ids": ["vimzuGQvdcU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, man, yells", "stream, water, flow"], "captions_pred_video": ["a group of people are rafting down a river", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while water is splashing and a child is speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a man speaks as a car is passing by"], "sample_ids": ["xSKJGCItUWE", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["engine, run, boy", 
"a, car, pass"], "captions_pred_video": ["footage of the helicopter flying in the room", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a boy speaking?", "label": 0}, {"captions": ["a man speaks as a machine runs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vD6lYD1l0BY", "su6FAOcOA8c"], "start_seconds": ["330", "4"], "properties": ["a, machine, run", "engine, idle, woman"], "captions_pred_video": ["game controller being held in the hands of the person", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["paper folding and crinkling", "winds blows roughly as a vehicle races past"], "sample_ids": ["zPpG3RD8lSs", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["paper, fold, crinkle", "wind, blows, vehicle"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece 
of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["dishes cling together then a man begins to speak", "some tunes played by whistling"], "sample_ids": ["sQGXqGcwOTc", "u6BnG6YZqJ4"], "start_seconds": ["3", "0"], "properties": ["cling, speak, dishes", "tune, play, whistling"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["an aircraft engine runs", "a car accelerates and wind blows"], "sample_ids": ["yLCORCnd35Q", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["engine, aircraft, runs", "accelerates, wind, blows"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", null], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "wind blowing followed by a zoom"], "sample_ids": ["yLy-WycbVVE", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["background, people, talk", "wind, blow, zoom"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "is taken 
from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "a car speeding up in the distance"], "sample_ids": ["xyL9F5VrjkE", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["wind, motor, distance", "distance, car, speed"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a race car accelerates and revs its engine "], "question": "which object is moving faster", "label": 0}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["y8WEcpOlT3I", "sQGXqGcwOTc"], "start_seconds": ["40", "3"], "properties": ["harsh, wind, blows", "cling, speak, dishes"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["wind noise makes sound into a microphone", "roadway noise occurs and a truck accelerates"], "sample_ids": ["w8uLijTqtlU", "tgbONvsP47Y"], "start_seconds": ["70", "0"], "properties": ["wind, microphone, noise", "noise, truck, accelerate"], "captions_pred_video": ["footage is blurry and shaky", "footage of a fire truck entering a garage"], "captions_pred_audio": ["the wind is blowing strongly", "a car is driving on the road "], "question": "which noise is caused by a truck", "label": 1}, {"captions": ["material crumbles into a microphone", "an airplane engine runs"], "sample_ids": ["vofpvUo6NAw", "yVPZ2MNWpms"], 
"start_seconds": ["220", "0"], "properties": ["material, crumbles, microphone", "engine, airplane, runs"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a person whistles and clicks a mouse", "a small engine spits as it runs"], "sample_ids": ["zCrAfDfv6-A", "sZvwOuuPGP0"], "start_seconds": ["30", "50"], "properties": ["person, mouse, click", "spits, engine, runs"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", "of a bulldozer clearing a road in a forest stock footage and royalty-free videos"], "captions_pred_audio": ["a person whistles a song", "a medium engine is running "], "question": "which entity is not a person?", "label": 1}, {"captions": ["birds chirp then an animal grunts", "people applaud and hoot and chat quietly"], "sample_ids": ["tDlysoZiA1I", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["animal, grunt, chirp", "people, applaud, hoot"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "vehicle engines race around a track as a man commentates"], "sample_ids": ["w1mlz3Pe4fU", "sZPuqDgX2V0"], "start_seconds": ["300", "30"], "properties": ["vocalize, chirp, continuously", "commentator, race, track"], "captions_pred_video": ["of a bird in a cage", null], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking and a helicopter is flying overhead "], "question": "which entity is a human activity", "label": 1}, {"captions": ["a young 
female speaks, followed by spraying and a female screaming", "some men converse over an engine running"], "sample_ids": ["uYT5gxnyMWM", "sCiy7QS1U"], "start_seconds": ["50", "300"], "properties": ["female, spraying, scream", "men, converse, engine"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more calm", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "wind blowing followed by a zoom"], "sample_ids": ["s4Uz1Ffgo04", "vr8ZXjEBhMQ"], "start_seconds": ["100", "150"], "properties": ["water, rushes, motorcycle", "wind, blow, zoom"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sd7xVssqlw", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["accelerates, tires, squealing", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a jet engine roars "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a man speaks as a machine runs", "paper is crumpling consistently"], "sample_ids": ["vD6lYD1l0BY", "v5cSxLaHADY"], "start_seconds": ["330", "0"], "properties": ["a, machine, run", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a 
man is speaking and dishes are being washed ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "water splashes and a motorboat passes as people yell"], "sample_ids": ["yeFvk9x0wWI", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["chirp, twitter, clatter", "water, splashes, motorboat"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine is idling", "a car accelerates and wind blows"], "sample_ids": ["vZAqdHZ81yA", "u0TrcHhkPQ"], "start_seconds": ["180", "20"], "properties": ["engine, motorcycle, idling", "accelerates, wind, blows"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a vehicle accelerates and squeals tires"], "sample_ids": ["ugHJF0hfYkg", "yRx9txMcBl0"], "start_seconds": ["10", "40"], "properties": ["engine, running, continuously", "accelerates, tires, squeals"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 
5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is revving its engine and skidding "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yNtRmrn0io8", "tiDFTC-5vU"], "start_seconds": ["210", "30"], "properties": ["storm, distance, strike", "male, duck, laugh"], "captions_pred_video": ["footage of a house in the middle of the night", null], "captions_pred_audio": ["rain falls and thunder roars", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["children cry and people talk", "a toilet flushes and a female speaks"], "sample_ids": ["xLwHe825Zs", "yaln9y8I7ms"], "start_seconds": ["18", "230"], "properties": ["people talk, children cry, people talk", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a woman speaks", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "a child speaks in closed space"], "sample_ids": ["w-4gHptFNuU", "yW6FWLSLkx4"], "start_seconds": ["21", "40"], "properties": ["engine revs, accelerates, bump", "child, space, speak"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a child and woman laughs and the woman speaks"], "sample_ids": 
["sQwlkXjQabo", "uPDn2BFTHk"], "start_seconds": ["10", "140"], "properties": ["water, spray, surface", "woman, laughs, speaks"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a baby laughs and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "several insects fly while two men talk"], "sample_ids": ["w0xsN8X18Y", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["music, surface, rain", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a car speeding up in the distance"], "sample_ids": ["yks4cLgIDMc", "u0TrcHhkPQ"], "start_seconds": ["170", "20"], "properties": ["background, speaking, child", "distance, car, speed"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "a woman and man speak while food is frying"], "sample_ids": ["wTideSjRFS0", "zk-xJGQU8-4"], "start_seconds": ["30", "130"], "properties": ["food, sizzle, woman", "food, man, woman"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a woman is speaking while dishes are clanging and music is playing in the 
background "], "question": "which entity has a man speaking while food is frying?", "label": 1}, {"captions": ["a male speaks over some small clicks", "people speak as gunfire rings out"], "sample_ids": ["uXxVebHsGZ8", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["male, clicks, speak", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sSMl2vc3ek", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["a person, laughs, snores", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person?", "label": 0}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "a car speeding up in the distance"], "sample_ids": ["ukg5L09Wpvo", "u0TrcHhkPQ"], "start_seconds": ["150", "20"], "properties": ["clickety-clack, train, whistle", "distance, car, speed"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wqZ135Ssz0", "tdWhHV3X25Q"], "start_seconds": ["60", "60"], "properties": ["man, woman, squawks", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to 
another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vlS6YMeWAPo", "su6FAOcOA8c"], "start_seconds": ["40", "4"], "properties": ["noise, bleat, call", "engine, idle, woman"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a goat bleats and birds chirp", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "people applaud and hoot and chat quietly"], "sample_ids": ["slZLHwNbbt4", "wwyfGO2J4"], "start_seconds": ["300", "90"], "properties": ["clap, distance, horn", "people, applaud, hoot"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", null], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yRx9txMcBl0", "tDVADusiIoc"], "start_seconds": ["40", "60"], "properties": ["accelerates, tires, squeals", "water, radio, man"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 
5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "some men converse over an engine running"], "sample_ids": ["vlS6YMeWAPo", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["sheep, baa, birds", "men, converse, engine"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation?", "label": 1}, {"captions": ["a toilet flushes and water drains", "some tunes played by whistling"], "sample_ids": ["sfAvvZwdLCY", "u6BnG6YZqJ4"], "start_seconds": ["20", "0"], "properties": ["water drains, flushes, water", "tune, play, whistling"], "captions_pred_video": ["footage of the toilet in the bathroom", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a toilet is flushed", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a telephone rings followed by a woman talking"], "sample_ids": ["tQWGZLItBXk", "tGcFnX0GHI"], "start_seconds": ["170", "0"], "properties": ["music, kid, speak", "ring, talk, woman"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a 
dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["goats bleat and people speak", "people applaud and hoot and chat quietly"], "sample_ids": ["z5iUE5h0EPs", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["goats bleat, people speak, language", "people, applaud, hoot"], "captions_pred_video": ["of the goat in the barn", null], "captions_pred_audio": ["a goat bleats and a man speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "an insect buzzes around continuously"], "sample_ids": ["w5W5Kqtc8E", "v25l1jef3JY"], "start_seconds": ["100", "0"], "properties": ["wind, blow, vehicle", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "wind blows as people chatter quietly"], "sample_ids": ["sapQIQUhFc", "xBxDz0CFVn0"], "start_seconds": ["280", "30"], "properties": ["liquid, flow, distance", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a machine beeps continuously"], "sample_ids": ["ugHJF0hfYkg", "y682ml90jGw"], "start_seconds": ["10", "11"], "properties": ["engine, running, continuously", "beeps, machine, continuously"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", 
null], "captions_pred_audio": ["a helicopter is flying overhead ", "a beeping sound is being made "], "question": "which machine beeps continuously", "label": 1}, {"captions": ["a propeller moves loudly nearby", "an engine runs loudly"], "sample_ids": ["ugHJF0hfYkg", "vqZuVbG6-HI"], "start_seconds": ["10", "130"], "properties": ["loud, propeller, move", "loud, engine, run"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a helicopter is flying overhead ", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "water splashes as an animal walks through"], "sample_ids": ["tDVADusiIoc", "w1ir-sZ3Im8"], "start_seconds": ["60", "90"], "properties": ["man, radio, blows", "animal, water, splashes"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a stream of water runs briefly"], "sample_ids": ["wqZ135Ssz0", "x-PeY8Yb8M4"], "start_seconds": ["60", "300"], "properties": ["man, woman, squawks", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "roadway noise occurs and a truck accelerates"], "sample_ids": ["w5W5Kqtc8E", "tgbONvsP47Y"], "start_seconds": ["100", "0"], "properties": ["wind, 
blow, vehicle", "noise, truck, accelerate"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zdYdyF9-m8U", "su6FAOcOA8c"], "start_seconds": ["7", "4"], "properties": ["wind, crash, shoreline", "engine, idle, woman"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["waves crash and wind blows ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sAam2NqGhLY", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["snoring, breathing, child", "male, duck, laugh"], "captions_pred_video": ["of a little girl sleeping on a couch", null], "captions_pred_audio": ["a person is snoring", "a man is speaking and ducks are quacking"], "question": "which entity is a person", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wDVMhEdTiVw", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["gun, shoot, water", "stream, water, flow"], "captions_pred_video": ["a blurry image of trees and water in the forest", "footage is blurry and out of focus"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing 
while men speak", "a machine beeps continuously"], "sample_ids": ["wsHBIgzs9Fs", "y682ml90jGw"], "start_seconds": ["50", "11"], "properties": ["horn, continuous, buzzing", "beeps, machine, continuously"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", null], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a beeping sound is being made "], "question": "which entity is continuous", "label": 1}, {"captions": ["wind blowing followed by a zoom", "small dogs yip and bark sharply"], "sample_ids": ["vr8ZXjEBhMQ", "v-wcQf4BDY0"], "start_seconds": ["150", "120"], "properties": ["wind, blow, zoom", "bark, yip, sharply"], "captions_pred_video": ["is taken from a motorcycle's point of view", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a cat meows and children speak", "a person snoring several times"], "sample_ids": ["x5cuQjOdM3E", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["cat, speak, children", "snore, person, several"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a cat meows and a woman speaks", "a person is snoring loudly"], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "people cheer as a vehicle engine revs"], "sample_ids": ["zO-LSSY92ZM", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["liquid, surface, sound", "engine revs, vehicle, people"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter 
youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["steam is hissing and hissing", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["w6RTHR6AeAg", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["call, owl, screech", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "a duck quacks continuously"], "sample_ids": ["xjhAnI2q6hM", "vh30P49Po6s"], "start_seconds": ["6", "30"], "properties": ["wind, blow, loudly", "quacks, continuously, duck"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a truck is revving its engine and a 
man is speaking ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "people speak softly as food sizzles"], "sample_ids": ["sU53zg9Jp7s", "yhQ2Lg-7qDY"], "start_seconds": ["380", "130"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "food, sizzle, speak"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a faucet is running and a man is speaking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["s4Uz1Ffgo04", "wz7N8YRy74I"], "start_seconds": ["100", "30"], "properties": ["water, rushes, motorcycle", "rooster, crow, background, men"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["w9lpbUn0hPc", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["male, wind, rustling", "rooster, crow, background, men"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is 
speaking with wind noise and breathing sounds in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a woman speaks and other women and a man talk with her"], "sample_ids": ["se87d6yxEOA", "vbpKkWvfOu4"], "start_seconds": ["10", "560"], "properties": ["run, whistle, pass", "a, woman, man"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a man speaks over a running engine and blowing wind"], "sample_ids": ["zl9Dqx-j7q4", "ylpYOorfH4o"], "start_seconds": ["6", "410"], "properties": ["engine, laugh, loud", "engine, running, wind"], "captions_pred_video": ["footage of a man driving a car in the dark", "for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15"], 
"captions_pred_audio": ["a jet engine roars ", "a man is speaking and an engine is revving"], "question": "which entity is a man speaking over a running engine and blowing wind?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["slZLHwNbbt4", "yDoT73BWsdA"], "start_seconds": ["300", "10"], "properties": ["train, horn, sound", "engine, revs, vehicle"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["people speak and tapping occurs", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["tFCUUGdREgA", "ziUT9IFTkjg"], "start_seconds": ["70", "10"], "properties": ["people, tap, speak", "background, birds, rustling"], "captions_pred_video": ["a person riding a white horse in an indoor arena", null], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["xyL9F5VrjkE", "wnpJndXuxLc"], "start_seconds": ["20", "50"], "properties": ["wind, motor, distance", "blows, vehicle, train"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is about a train blowing a horn?", "label": 
1}, {"captions": ["some clanking with distant murmuring", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["uMTTDZ2mb4", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["clanking, murmuring, distant", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more abrasive", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uC9dtII1KDI", "tdWhHV3X25Q"], "start_seconds": ["150", "60"], "properties": ["wind, gusts, distance", "applause, audience, yells"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a human activity", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "an infant crying as a woman laughs"], "sample_ids": ["vbpKkWvfOu4", "xhmRY9yhC7c"], "start_seconds": ["560", "20"], "properties": ["a, woman, man", "a, laugh, infant"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a baby cries and a woman speaks"], "question": "which woman is laughing", "label": 1}, {"captions": ["someone whistles a tune", "a man speaks over 
a radio as wind blows and water splashes"], "sample_ids": ["sIXTftIuUgw", "tDVADusiIoc"], "start_seconds": ["90", "60"], "properties": ["someone, tune, whistle", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person whistling a song", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yJ0TePmaOo", "wz7N8YRy74I"], "start_seconds": ["390", "30"], "properties": ["two hard objects, man, speak", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a toilet flushes and a female speaks"], "sample_ids": ["zhx6hoYrHeI", "yaln9y8I7ms"], "start_seconds": ["160", "230"], "properties": ["engine, sputter, rough", "female, flushes, toilet"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "footage is blurry and out of focus"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "a telephone rings followed by a woman talking"], "sample_ids": ["w0xsN8X18Y", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["music, surface, rain", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a 
dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a man speaks as crickets sing", "water runs into a sink while men speak"], "sample_ids": ["ryFDPxgDOGc", "vzceMbklWc"], "start_seconds": ["570", "180"], "properties": ["a, crickets, sing", "water, sink, run"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "water is running and a man is speaking"], "question": "which entity is a man speaking as crickets sing?", "label": 0}, {"captions": ["an engine works in idle nearby followed by a man talking", "a man speaks as a car is passing by"], "sample_ids": ["wqADXCzngMw", "sK4u5T8hW78"], "start_seconds": ["340", "30"], "properties": ["engine, idle, man", "a, car, pass"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["wSVhSdj0F0", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["beep, clang, footsteps", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": 
["a car horn honks and keys jangle with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be heard in a car", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "an airplane engine spools and people speak"], "sample_ids": ["tK4VlLsNxak", "wTjoRj1se3U"], "start_seconds": ["120", "390"], "properties": ["a, dial, telephone", "airplane, engine, spool"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a jet engine is running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tIY7qOV3rEM", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "multiple, people, yell"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a crowd of people are talking and laughing"], "question": "which entity is a human activity", "label": 1}, {"captions": ["a toilet flushes and water drains", "a small engine idles continuously"], "sample_ids": ["sfAvvZwdLCY", "y5WII6cTH7k"], "start_seconds": ["20", "40"], "properties": ["water drains, flushes, water", "engine, idle, continuously"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a toilet is flushed", "an engine is knocking and vibrating "], "question": "which entity is not a machine?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "wind blowing followed by a zoom"], 
"sample_ids": ["x6ijhqRY38s", "vr8ZXjEBhMQ"], "start_seconds": ["250", "150"], "properties": ["something metal, glass, hit", "wind, blow, zoom"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "small dogs yip and bark sharply"], "sample_ids": ["w5W5Kqtc8E", "v-wcQf4BDY0"], "start_seconds": ["100", "120"], "properties": ["wind, blow, vehicle", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "vehicles pass by on a roadway"], "sample_ids": ["uYT5gxnyMWM", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["person, spray, yell", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a car is driving on the road "], "question": "which entity is more passive", "label": 1}, {"captions": ["a kid speaks followed by music playing", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tQWGZLItBXk", "vbZ-0lGPneg"], "start_seconds": ["170", "30"], "properties": ["music, kid, speak", "a woman, a television program, a bird"], "captions_pred_video": ["worms revolution screenshots", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a child speaks music plays video game sounds 
sound effects and sound effects play ", "a woman is speaking and a dog is whimpering"], "question": "which entity has more birds", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "people speak as gunfire rings out"], "sample_ids": ["w5W5Kqtc8E", "wqTCwqVRDlk"], "start_seconds": ["100", "80"], "properties": ["wind, engine, scream", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["viuTg1M-dqg", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["two men, speak, follow", "a woman, something, fried"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["xl2PIWyXaM", "vzxHnu-SFEw"], "start_seconds": ["160", "80"], "properties": ["chirp, man, younger person", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to 
make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["birds are chirping and people are talking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vlJS7LN2XyM", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["background, clocks, ticking", "a woman, a television program, a bird"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program in the background?", "label": 1}, {"captions": ["someone is burping continuously", "water is sprayed across a hard surface"], "sample_ids": ["y636gklDioE", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["burps, burps, burps", "water, spray, surface"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a person burps loudly several times", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a door opens and birds chirp", "water pouring 
and bubbling"], "sample_ids": ["yeFvk9x0wWI", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["door, open, birds", "water, bubbles, pouring"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["a child speaks", "water splashes as an animal walks through"], "sample_ids": ["yW6FWLSLkx4", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["a, child, speaks", "animal, water, splashes"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks as several small engines run", "a vehicles accelerate quickly and someone laughs"], "sample_ids": ["u9A6VZQCZpU", "uWPRNLnpy7Y"], "start_seconds": ["30", "10"], "properties": ["a, man, talk", "accelerate, laugh, vehicle"], 
"captions_pred_video": [null, "is taken from a car driving down the street"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a car accelerates and revs its engine "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["zgUgkpk78xU", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["horn, bells, ring", "female, spraying, scream"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking and a baby is crying"], "question": "which entity is a warning", "label": 0}, {"captions": ["a man speaks while playing a video game on a keyboard", "people applaud and hoot and chat quietly"], "sample_ids": ["tw76HGONaKg", "wwyfGO2J4"], "start_seconds": ["570", "90"], "properties": ["A, game, keyboard", "people, applaud, hoot"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of 
time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "a person snores loudly multiple times at a close distance"], "sample_ids": ["rwtmaKiCcQU", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["nozzle, depressed, spray can", "loud, multiple, distance"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", null], "captions_pred_audio": ["spraying and people speaking", "a person snoring loudly"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a sleeping person snores and wheezes", "people speak as gunfire rings out"], "sample_ids": ["spJCm8tD9Zo", "wqTCwqVRDlk"], "start_seconds": ["90", "80"], "properties": ["snores, wheezes, sleeps", "gunfire, ring, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "paper is crumpling consistently"], "sample_ids": ["sapQIQUhFc", "v5cSxLaHADY"], "start_seconds": ["280", "0"], "properties": ["liquid, flow, distance", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 
1}, {"captions": ["multiple people speak and children yell while water gurgles", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vb1fPSDI4c", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["multiple, people, yell", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a crowd of people are talking and laughing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 0}, {"captions": ["small dogs growl, bark and yip.", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sShpyu2l4YQ", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["growl, bark, yip", "airplane, boy, fly"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a dog is barking and growling", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uZesmtKZGSw", "vJ7JPEFhyLA"], "start_seconds": ["250", "16"], "properties": ["car, track, man", "three men, wind, flow"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a man in a red shirt paddling a kayak in the water"], 
"captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about cars going around a track?", "label": 0}, {"captions": ["an adult man speaks over glass clinking", "water splashes as an animal walks through"], "sample_ids": ["u6jIvCtKarQ", "w1ir-sZ3Im8"], "start_seconds": ["70", "90"], "properties": ["a, man, speaks", "animal, water, splashes"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["w2M4i1mklOA", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "gun, shoot, water"], "captions_pred_video": ["footage of an antique clock", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a small engine spits as it runs", "a infant makes noise and is excited"], "sample_ids": ["sZvwOuuPGP0", "wIJK3-5y0kA"], "start_seconds": ["50", "30"], "properties": ["spits, engine, runs", "noise, excited, infant"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a medium engine is running ", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "paper 
is crumpling consistently"], "sample_ids": ["xyL9F5VrjkE", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["wind, motor, distance", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a car speeding up in the distance"], "sample_ids": ["vddP56-ogds", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["water, splash, person, laugh", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["un9VQlzgZM", "t97k0cejSQE"], "start_seconds": ["5", "250"], "properties": ["females, talk, laugh", "sound, chirp, buzz"], "captions_pred_video": [null, "a bee on a purple thistle flower"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a bee buzzes and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a small engine idles continuously", "an infant crying as a woman laughs"], "sample_ids": ["y5WII6cTH7k", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["engine, idle, continuously", "a, laugh, infant"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a baby cries and a woman speaks"], "question": "which entity is 
not a person?", "label": 0}, {"captions": ["some clanking with distant murmuring", "winds blows roughly as a vehicle races past"], "sample_ids": ["uMTTDZ2mb4", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["clanking, murmuring, distant", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a person is snoring while sleeping", "wind blows as people chatter quietly"], "sample_ids": ["vJrjSeP17yE", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["a person is sleeping, snoring, person", "wind, chatter, people"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage is blurry and out of focus"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "dishes cling together then a man begins to speak"], "sample_ids": ["vhJWZheqaE", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["water drains unevenly, toilet flushes, water drains", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a toilet is flushed", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a toilet flushing and water draining unevenly?", "label": 0}, {"captions": ["children speak as a female ask them questions", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wEBlkGWVWwE", "vJ7JPEFhyLA"], "start_seconds": ["260", "16"], "properties": ["female, speak, questions", "three men, wind, flow"], "captions_pred_video": ["shows a person writing on the 
whiteboard", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a female speaking to children?", "label": 0}, {"captions": ["water gurgles, metal squeaks and the water stops", "some tunes played by whistling"], "sample_ids": ["x4a9YGIw4ok", "u6BnG6YZqJ4"], "start_seconds": ["120", "0"], "properties": ["water, gurgles, stops", "tune, play, whistling"], "captions_pred_video": ["footage is blurry and out of focus", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a toilet flushes and water splashes", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "water runs into a sink while men speak"], "sample_ids": ["zcDwZ6W7E3E", "vzceMbklWc"], "start_seconds": ["180", "180"], "properties": ["a, man, speak", "water, sink, run"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "water is running and a man is speaking"], "question": "which entity is about water running into a sink?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["yajyRTUQk3U", "y8WEcpOlT3I"], "start_seconds": ["400", "40"], "properties": ["noise, woman, speak", "harsh, wind, blows"], "captions_pred_video": ["- a woman cooking in the kitchen", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking over a harsh wind?", 
"label": 1}, {"captions": ["a helicopter engine runs continuously", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ugHJF0hfYkg", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["engine, running, continuously", "wind, blow, vehicle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a running engine", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tezvROoo4bs", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["audio, throttle, speaking", "female, spraying, scream"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["an aircraft engine runs", "an insect buzzes around continuously"], "sample_ids": ["yLCORCnd35Q", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["engine, aircraft, runs", "buzzes, continuously, insect"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vqZuVbG6-HI", "wDVMhEdTiVw"], "start_seconds": ["130", "30"], 
"properties": ["background, male, female", "gun, shoot, water"], "captions_pred_video": ["footage is blurry because it's raining outside", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["music plays followed by gunshots and then an explosion", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xKB8O8LTs6s", "uYT5gxnyMWM"], "start_seconds": ["70", "50"], "properties": ["music, gunshots, explosion", "female, spraying, scream"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a woman is speaking and a baby is crying"], "question": "which entity is more calm", "label": 1}, {"captions": ["a helicopter engine idles continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["ugHJF0hfYkg", "yDoT73BWsdA"], "start_seconds": ["10", "10"], "properties": ["engine, idle, continuously", "engine, revs, vehicle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a helicopter is flying overhead ", "a race car accelerates and revs its engine "], "question": "which engine is revving", "label": 1}, {"captions": ["sawing of wood and rustling with leaves blowing in the distance", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["uiItxDsDMFI", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["sound, distance, leaves", "sound, chirp, buzz"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "a bee on a purple thistle flower"], "captions_pred_audio": ["a saw is being 
used with background noise ", "a bee buzzes and a woman speaks"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a person sniffs and sneezes", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uRlbY6aoBU", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["sneezes, person, sniffs", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is sneezing ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a door slams shut roughly", "a toilet flushes and water drains"], "sample_ids": ["zkKdxzNC97Y", "sfAvvZwdLCY"], "start_seconds": ["27", "20"], "properties": ["a door, slams, shut", "water drains, flushes, water"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a door is opened and closed", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "a vehicle engine accelerating then running on idle"], "sample_ids": ["se87d6yxEOA", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["run, whistle, pass", "engine, accelerate, idle"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "an engine is idling"], "question": "which engine is running on idle", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["vYkA3cfXp5Q", "wyllXV6PjKo"], "start_seconds": ["30", "30"], "properties": ["speed, idle, accelerate", "a baby, a woman, a man"], "captions_pred_video": ["footage of a car driving down the street 
on a sunny day", null], "captions_pred_audio": ["an engine is idling", "a woman speaks and a baby cries"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "people applaud and hoot and chat quietly"], "sample_ids": ["tw76HGONaKg", "wwyfGO2J4"], "start_seconds": ["570", "90"], "properties": ["music, click, man", "people, applaud, hoot"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "people are clapping and speaking with background noise "], "question": "which entity has more people", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "metal clacking as food and oil sizzles followed by a woman talking"], "sample_ids": ["sTpirNYo8vQ", "vW4x7S1VfQc"], "start_seconds": ["30", "150"], "properties": ["a, tone, fast", "clacking, oil, woman"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a person cooking fish in a frying pan on a stove top"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "food sizzles in a frying pan"], "question": "which 
entity is about cooking?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a man woman speak while crickets sing"], "sample_ids": ["xvDdE3zNf8Y", "zTLVJCo4WEE"], "start_seconds": ["120", "30"], "properties": ["a, female, speaks", "a, crickets, sing"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman speaks and crickets chirp"], "question": "which entity has a female speaking softly as paper crinkles?", "label": 0}, {"captions": ["an adult woman and an adult man speak", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zTLVJCo4WEE", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["two people, adult, speak", "multiple, people, yell"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a toilet flushes and a female speaks"], "sample_ids": ["vlJS7LN2XyM", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["background, clocks, ticking", "female, flushes, toilet"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage is blurry and out of focus"], "captions_pred_audio": ["a ticktock of a clock", "a toilet flushes and a man speaks"], "question": "which entity is accompanied by a female speaking?", "label": 1}, {"captions": ["water pouring and bubbling", "someone is typing on a computer keyboard"], "sample_ids": ["uyRfq-jKPpo", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["water, bubbles, pouring", "keyboard, type, computer"], 
"captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "how to make money on youtube in spanish"], "captions_pred_audio": ["water is running from a faucet", "a person is typing on a keyboard"], "question": "which is not a source of bubbles", "label": 1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["zkKdxzNC97Y", "wSVhSdj0F0"], "start_seconds": ["27", "10"], "properties": ["loud, bang, noise", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["footage of the door opening and closing in slow motion", null], "captions_pred_audio": ["a door is opened and closed", "a car horn honks and keys jangle with background noise "], "question": "which entity is softer", "label": 1}, {"captions": ["a consistent ticking pattern", "loud ringing of a telephone stops followed by a man speaking and a digital beep"], "sample_ids": ["sCeWURVHfOM", "uzQnlJXBbOM"], "start_seconds": ["30", "50"], "properties": ["ticking, pattern, clock", "ringing, beep, stop"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "footage of a person using a cell phone on a table"], "captions_pred_audio": ["ticking of a clock", "a 
telephone rings and a man speaks"], "question": "which entity is a clock", "label": 0}, {"captions": ["a person snores hilariously while someone laughs", "someone snores nearby"], "sample_ids": ["sSMl2vc3ek", "spJCm8tD9Zo"], "start_seconds": ["20", "90"], "properties": ["a person, laughs, snores", "someone snores, nearby, someone"], "captions_pred_video": [null, "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a person snoring loudly", "a person is snoring loudly"], "question": "which entity is a person?", "label": 0}, {"captions": ["a car speeding up in the distance", "a telephone rings followed by a woman talking"], "sample_ids": ["u0TrcHhkPQ", "tGcFnX0GHI"], "start_seconds": ["20", "0"], "properties": ["distance, car, speed", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a motorcycle engine is idling", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["vZAqdHZ81yA", "wSVhSdj0F0"], "start_seconds": ["180", "10"], "properties": ["engine, motorcycle, idling", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a car horn honks and keys jangle with background noise "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a person sniffs and sneezes", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["uRlbY6aoBU", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["sneezes, person, sniffs", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a car accelerates and wind blows"], "sample_ids": ["wIvYjuR3nrg", "u0TrcHhkPQ"], "start_seconds": ["9", "20"], "properties": ["birds, pigeons, vocalize", "accelerates, wind, blows"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", null], "captions_pred_audio": ["birds are chirping and cooing", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks with water running", "a cat meows and children speak"], "sample_ids": ["wTideSjRFS0", "x5cuQjOdM3E"], "start_seconds": ["30", "30"], "properties": ["water, running, woman", "cat, speak, children"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a black background with an airplane flying in the sky"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a cat meows and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["an electronic device bleeps once", "a man speaks as insects buzz and a bird chirps"], "sample_ids": ["tHJ6JSa8Y4", "t25U-v4k4ts"], "start_seconds": ["0", "40"], "properties": ["bleeps, electronic, device", "a, chirps, bird"], "captions_pred_video": [null, "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": 
["a clock is ticking and beeping", "a man is speaking and bees are buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a woman speaks in a fast tone with a male"], "sample_ids": ["xZepNM9qcRA", "sTpirNYo8vQ"], "start_seconds": ["30", "30"], "properties": ["background, motor, run", "a, tone, fast"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a man is speaking while a car is revving and accelerating "], "question": "which entity is a man speaking?", "label": 0}, {"captions": ["a man talks as something metal hits against and glass is set down", "an engine runs loudly"], "sample_ids": ["x6ijhqRY38s", "vqZuVbG6-HI"], "start_seconds": ["250", "130"], "properties": ["something metal, glass, hit", "loud, engine, run"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a person sniffles and sneezes", "a man speaks, another man speaks, and a small bell dings"], "sample_ids": ["uRlbY6aoBU", "t69a8aRKhmc"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "a, b, c"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and birds are chirping in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["multiple insects buzz over rustling wind", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tMJne1a4AFI", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["wind, buzz, rustling", 
"a woman, something, fried"], "captions_pred_video": ["a swarm of bees on the ground", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["vSeGhaZt-aI", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["water, bubbles, speak", "wind, blows, vehicle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a calm environment", "label": 0}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a toilet flushes and a female speaks"], "sample_ids": ["vuUVPzd2FXw", "yaln9y8I7ms"], "start_seconds": ["160", "230"], "properties": ["a, steam, release", "female, flushes, toilet"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a toilet flushes and a man speaks"], "question": "which entity is a video of a person speaking?", "label": 1}, {"captions": ["a horse runs while two women talk", "people cheer as a vehicle engine revs"], "sample_ids": ["sdvI1mHAsc", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["two women, horse, run", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an 
infant crying frantically", "a man speaks followed by another man speaking outside"], "sample_ids": ["zwOBqeFTgiU", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["cry, infant, frantically", "two men, speak, follow"], "captions_pred_video": ["of the baby crying in the car seat", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a human", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a infant makes noise and is excited"], "sample_ids": ["vYkA3cfXp5Q", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["engine, accelerate, idle", "noise, excited, infant"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["an engine is idling", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["wind blows strongly", "dishes cling together then a man begins to speak"], "sample_ids": ["w8uLijTqtlU", "sQGXqGcwOTc"], "start_seconds": ["70", "3"], "properties": ["wind, blows, strongly", "cling, speak, dishes"], "captions_pred_video": ["footage is blurry and shaky", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["the wind is blowing strongly", "mechanisms are operating and water is splashing "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a dog barks and whimpers", "some clanking with distant murmuring"], "sample_ids": ["sShpyu2l4YQ", "uMTTDZ2mb4"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "clanking, murmuring, distant"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "people are talking and a car is driving by with wind noise in the background "], "question": 
"which entity is quieter", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vlJS7LN2XyM", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["background, clocks, ticking", "clickety-clack, train, whistle"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a ticktock of a clock", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["an animal quacks rapidly", "someone whistles a tune"], "sample_ids": ["vh30P49Po6s", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["animal, quacks, rapidly", "someone, tune, whistle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "white noise and snoring with some rustling in the background"], "sample_ids": ["zl9Dqx-j7q4", "xzKKf9bKNUo"], "start_seconds": ["6", "10"], "properties": ["engine, laugh, loud", "background, noise, snoring"], "captions_pred_video": ["footage of a man driving a car in the dark", "shows a woman laying on a bed with her eyes closed and her mouth open"], "captions_pred_audio": ["a jet engine roars ", "a person snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a car accelerates and wind blows"], "sample_ids": ["vXlk0lIQBFo", "u0TrcHhkPQ"], "start_seconds": ["470", "20"], "properties": ["wind, talk, vocalize", 
"accelerates, wind, blows"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "an infant crying frantically"], "sample_ids": ["x5cuQjOdM3E", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["cat, talk, meow", "cry, infant, frantically"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the baby crying in the car seat"], "captions_pred_audio": ["a cat meows and a woman speaks", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a man speaks as a car is passing by"], "sample_ids": ["sLUnaPT5gM8", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "a, car, pass"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to a car passing by?", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "an infant crying frantically"], "sample_ids": ["vfYTJq7nU", "zwOBqeFTgiU"], "start_seconds": ["130", "30"], "properties": 
["ducks, quack, man", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tOj4tdLRaA", "w5W5Kqtc8E"], "start_seconds": ["70", "100"], "properties": ["woman, laugh, baby", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a baby laughing?", "label": 0}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uqFtmnhuqA8", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "gun, shoot, water"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a movie", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ukxt9I7eMMg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["food, pan, cook", "female, spraying, scream"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as 
a car is passing by", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sK4u5T8hW78", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "male, duck, laugh"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a car speeding up in the distance"], "sample_ids": ["yZrFNS7GFBQ", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["pigeon, buzzes, insect", "distance, car, speed"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a race car accelerates and revs its engine "], "question": "which object is moving faster", "label": 0}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a child and woman laughs and the woman speaks"], "sample_ids": ["ziUT9IFTkjg", "uPDn2BFTHk"], "start_seconds": ["10", "140"], "properties": ["background, birds, rustling", "woman, laughs, speaks"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a baby laughs and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["food is frying then a woman speaks", "some men converse over an engine running"], "sample_ids": ["ukxt9I7eMMg", "sCiy7QS1U"], "start_seconds": ["30", "300"], 
"properties": ["food, woman, speak", "men, converse, engine"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a person whistles and clicks a mouse", "a car speeding up in the distance"], "sample_ids": ["zCrAfDfv6-A", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["person, mouse, click", "distance, car, speed"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", null], "captions_pred_audio": ["a person whistles a song", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["people speak in a closed space", "a telephone rings followed by a woman talking"], "sample_ids": ["sTpirNYo8vQ", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["people, space, speak", "ring, talk, woman"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a car speeding up in the distance"], "sample_ids": ["t25U-v4k4ts", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["a, chirps, bird", "distance, car, speed"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["small dogs yip and bark sharply", "someone is typing on a computer keyboard"], "sample_ids": ["v-wcQf4BDY0", 
"v0x1odnXtP0"], "start_seconds": ["120", "210"], "properties": ["bark, yip, sharply", "keyboard, type, computer"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "how to make money on youtube in spanish"], "captions_pred_audio": ["a dog barks and growls", "a person is typing on a keyboard"], "question": "which entity is typing", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["yYEVLuqEytU", "su6FAOcOA8c"], "start_seconds": ["40", "4"], "properties": ["grunt, slurp, background", "engine, idle, woman"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a child speaks in closed space"], "sample_ids": ["uiItxDsDMFI", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["wood, piece, saw", "child, space, speak"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a saw is being used with background noise ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not being sawed", "label": 1}, {"captions": ["a small engine idles continuously", "several insects fly while two men talk"], "sample_ids": ["y5WII6cTH7k", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["engine, idle, continuously", "several, fly, men"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "a man climbing up a tree using a rope to reach the top of the tree"], 
"captions_pred_audio": ["an engine is knocking and vibrating ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is not stationary", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vzxHnu-SFEw", "uYT5gxnyMWM"], "start_seconds": ["80", "50"], "properties": ["two objects, woman, speak", "a, scream, girl"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a train horn sounds as it passes by", "a child speaks in closed space"], "sample_ids": ["ukg5L09Wpvo", "yW6FWLSLkx4"], "start_seconds": ["150", "40"], "properties": ["sound, train, horn", "child, space, speak"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "of the doll sitting on the bed with her hand outstretched 
towards the viewer"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xBxDz0CFVn0", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["wind, chatter, people", "female, spraying, scream"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wTjoRj1se3U", "w5W5Kqtc8E"], "start_seconds": ["390", "100"], "properties": ["airplane, engine, spool", "wind, blow, vehicle"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sG7TyPnFDR0", "uZesmtKZGSw"], "start_seconds": ["180", "250"], "properties": ["beeps, machine, smoke alarm", "men, talk, cars"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "a woman speaks as she rubs two objects together"], "sample_ids": ["uiS58TNyUiw", "vzxHnu-SFEw"], "start_seconds": ["430", "80"], "properties": ["audio, man, speaking", "two objects, woman, speak"], "captions_pred_video": ["of the pigeon in the cage", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a helicopter engine runs", "a small airplane approaches and then flies by, after and during which a boy 
speaks"], "sample_ids": ["t5ZbXbniOWk", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["engine, helicopter, run", "airplane, boy, fly"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "pigeons vocalize and birds chirp"], "sample_ids": ["wsHBIgzs9Fs", "uiS58TNyUiw"], "start_seconds": ["50", "430"], "properties": ["horn, continuous, buzzing", "vocalize, bird, chirp"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "of the pigeon in the cage"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a clock ticktocks"], "sample_ids": ["zl9Dqx-j7q4", "v-g-j2uTByM"], "start_seconds": ["6", "30"], "properties": ["engine, laugh, loud", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a man driving a car in the dark", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a jet engine roars ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "pigeons vocalize and birds chirp"], "sample_ids": ["sAam2NqGhLY", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["snoring, breathing, child", "vocalize, bird, chirp"], "captions_pred_video": ["of a little girl sleeping on a couch", "of the pigeon in the cage"], "captions_pred_audio": ["a person is snoring", "a man 
is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["dogs barking and whimpering", "water pouring and bubbling"], "sample_ids": ["tIY7qOV3rEM", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["barking, whimpering, dog", "water, bubbles, pouring"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["children speak and play together", "a vehicle engine accelerating then running on idle"], "sample_ids": ["yVVP8XvWJTo", "vYkA3cfXp5Q"], "start_seconds": ["260", "30"], "properties": ["children, speak, play", "engine, accelerate, idle"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "an engine is idling"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "birds chirp in the background while a horse neighs followed by a girl speaking"], 
"sample_ids": ["sNB8zxXneIM", "s59PfAghdkM"], "start_seconds": ["20", "0"], "properties": ["several, quack, cocks", "bird, chirp, background, horse, neigh"], "captions_pred_video": ["a group of geese in a cage", "is an anime scene featuring two people and a horse in the foreground and a fence in the background"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "birds are chirping a horse is neighing and a woman is speaking "], "question": "which entity has a horse?", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "birds coo incessantly"], "sample_ids": ["y2ZBGpgbhHM", "yZrFNS7GFBQ"], "start_seconds": ["30", "30"], "properties": ["birds, tweet, pant", "coo, bird, incessant"], "captions_pred_video": [null, "of the bird in the cage"], "captions_pred_audio": ["birds chirping and a dog panting", "an owl hoots in the background "], "question": "which bird is incessant", "label": 1}, {"captions": ["a church bell rings several times", "water is sprayed across a hard surface"], "sample_ids": ["sUVVjE3Ucp8", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["ring, bell, several", "water, spray, surface"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a church bell is ringing ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "dishes cling together then a man begins to speak"], "sample_ids": ["voJh2gJxXhA", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["music, frog, croak", "cling, speak, dishes"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["music is playing and crickets are chirping ", 
"mechanisms are operating and water is splashing "], "question": "which entity is about a frog?", "label": 0}, {"captions": ["a flush is followed by gurgling water, then another flush", "a infant makes noise and is excited"], "sample_ids": ["tqR406bGiE", "wIJK3-5y0kA"], "start_seconds": ["40", "30"], "properties": ["flush, water, gurgle", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a toilet is flushed", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wyllXV6PjKo", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["a kid, talk, cry", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["ugHJF0hfYkg", "uEU-Hg5MTN8"], "start_seconds": ["10", "27"], "properties": ["loud, intense, propeller", "animal, grunts, snorts"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["yLy-WycbVVE", "y2ZBGpgbhHM"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "birds, tweet, 
pant"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", null], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "birds chirping and a dog panting"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a baby cries and a woman moans", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["smDKStoHBJo", "sapQIQUhFc"], "start_seconds": ["0", "280"], "properties": ["a, cry, woman", "liquid, flow, distance"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more distant", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["xfaoyyzw2WU", "tDlysoZiA1I"], "start_seconds": ["180", "0"], "properties": ["loud, jet engine, roar", "animal, grunts, chirps"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "birds are chirping and a rooster is crowing "], "question": "which entity is quieter", "label": 1}, {"captions": ["male speech with light ticking", "a stream of water runs briefly"], "sample_ids": ["xO-Q2BlIIPU", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["male, speech, ticking", "stream, water, run"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["water flows 
followed by women screaming", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["w5W5Kqtc8E", "ziUT9IFTkjg"], "start_seconds": ["100", "10"], "properties": ["water, flow, women", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["water splashes as an animal walks through", "some men converse over an engine running"], "sample_ids": ["w1ir-sZ3Im8", "sCiy7QS1U"], "start_seconds": ["90", "300"], "properties": ["animal, water, splashes", "men, converse, engine"], "captions_pred_video": ["footage of a group of people riding horses through a river", null], "captions_pred_audio": ["water splashes and gurgles as people speak", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["multiple people speak then an emergency vehicle siren sounds", "water splashes as an animal walks through"], "sample_ids": ["wy1eKjR7KC0", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["audio, sound, siren", "animal, water, splashes"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and a siren is going off", "water splashes and gurgles as people speak"], "question": "which entity is a video", "label": 1}, {"captions": ["a large crowd cheers and applauds", "water flows and trickles"], "sample_ids": ["rqfQRErjfk8", "tB7hWb9gTuQ"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "water, flow, trickle"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "the rocks on the beach are surrounded by water and the sky is visible in the background"], 
"captions_pred_audio": ["a crowd of people clapping and cheering", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "water flows as men speak and yell"], "sample_ids": ["tZGN5a7ybxo", "vJ7JPEFhyLA"], "start_seconds": ["60", "16"], "properties": ["ring, train, horn", "water, flow, men"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a train horn blowing?", "label": 0}, {"captions": ["race cars go around a track as a man commentates", "paper is crumpling consistently"], "sample_ids": ["uZesmtKZGSw", "v5cSxLaHADY"], "start_seconds": ["250", "0"], "properties": ["car, track, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "water runs into a sink while men speak"], "sample_ids": ["weDbePuc-Xc", 
"vzceMbklWc"], "start_seconds": ["40", "180"], "properties": ["cartoon character, music, vocalize", "water, sink, run"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "water is running and a man is speaking"], "question": "which entity is a video of a sink?", "label": 1}, {"captions": ["a vehicle engine revs as the vehicle passes", "an airplane engine runs"], "sample_ids": ["yDoT73BWsdA", "yVPZ2MNWpms"], "start_seconds": ["10", "0"], "properties": ["engine, revs, vehicle", "engine, airplane, runs"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "an engine runs loudly"], "sample_ids": ["uJV8NDaHqqk", "vqZuVbG6-HI"], "start_seconds": ["100", "130"], "properties": ["loud, fly, chirp", "loud, engine, run"], "captions_pred_video": ["a bee hive in a wooden box", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a swarm of bees buzzing around", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as a machine runs", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vD6lYD1l0BY", "uEU-Hg5MTN8"], "start_seconds": ["330", "27"], "properties": ["a, machine, run", "a woman, laughs, animal"], "captions_pred_video": ["game controller being held in the hands of the person", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a baby is crying"], "question": "which 
entity has a man speaking as a machine runs?", "label": 0}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wy1eKjR7KC0", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["people, talk, distance", "rooster, crow, background, men"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a toilet flushes and water drains unevenly"], "sample_ids": ["zl9Dqx-j7q4", "vhJWZheqaE"], "start_seconds": ["6", "0"], "properties": ["engine, laugh, loud", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a toilet is flushed"], "question": "which entity is a toilet?", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a weapon fires multiple times"], "sample_ids": ["wqZ135Ssz0", "sMC07Ucy7kg"], "start_seconds": ["60", "10"], "properties": ["man, woman, squawks", "weapon, fire, multiple"], "captions_pred_video": [null, "footage is from a car's point of view"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["s4Uz1Ffgo04", "y2bVZ7rz-5M"], "start_seconds": 
["100", "280"], "properties": ["roars, background, people speaking", "motor noise, horn, siren"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "paper is crumpling consistently"], "sample_ids": ["s59PfAghdkM", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["bird, chirp, background, horse, neigh", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "paper is crumpled and crinkled"], "question": "which entity is more quiet", "label": 1}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "a man speaks as a motor runs in the background"], "sample_ids": ["vfYTJq7nU", "xZepNM9qcRA"], "start_seconds": ["130", "30"], "properties": ["rustling, ducks, quack", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["wqZ135Ssz0", "xV7Mg1QucSc"], "start_seconds": 
["60", "14"], "properties": ["man, woman, squawks", "alarm, ticktocks, laughs"], "captions_pred_video": [null, "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "an alarm clock ticks and a woman laughs"], "question": "which entity is accompanied by a man speaking", "label": 0}, {"captions": ["an infant crying frantically", "a motor noise is accompanied by a door opening and closing"], "sample_ids": ["zwOBqeFTgiU", "vBHyYJ8pL0"], "start_seconds": ["30", "2"], "properties": ["cry, infant, frantically", "noise, door, opening"], "captions_pred_video": ["of the baby crying in the car seat", null], "captions_pred_audio": ["a baby cries loudly", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is not a person", "label": 1}, {"captions": ["loud, continuous burping", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["y636gklDioE", "y2bVZ7rz-5M"], "start_seconds": ["20", "280"], "properties": ["loud, continuous, burping", "motor noise, horn, siren"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a person burps loudly several times", "a truck is honking its horn and a siren is blaring "], "question": "which is louder", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "vehicles pass by on a roadway"], "sample_ids": ["sEprKHm8Sj8", "tgbONvsP47Y"], "start_seconds": ["90", "0"], "properties": ["car, tires, slows", "pass, vehicle, roadway"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage of a fire truck 
entering a garage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving on the road "], "question": "which vehicle is moving faster", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sYITalLZjj4", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["water, rushes, background, birds", "female, spraying, scream"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["wind blows and birds chirp", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a speedboat passes quickly on the water"], "sample_ids": ["y2ZBGpgbhHM", "tjmoSi330GM"], "start_seconds": ["30", "23"], "properties": ["birds, tweet, pant", "speed, water, boat"], "captions_pred_video": [null, "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["birds chirping and a dog panting", "a motorboat speeds through water with wind noise "], "question": "which entity is moving faster", "label": 1}, {"captions": ["water rushes by", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["x-PeY8Yb8M4", "wDVMhEdTiVw"], "start_seconds": ["300", "30"], "properties": ["water, rushes, by", "gun, shoot, water"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a car is driving on a wet road ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a video of water moving?", "label": 0}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a child speaks in closed space"], "sample_ids": ["yZrFNS7GFBQ", "yW6FWLSLkx4"], 
"start_seconds": ["30", "40"], "properties": ["pigeon, buzzes, insect", "child, space, speak"], "captions_pred_video": ["of the bird in the cage", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["an owl hoots in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "water splashes as an animal walks through"], "sample_ids": ["weDbePuc-Xc", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["music, slaps, human", "animal, water, splashes"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video of a person playing music?", "label": 0}, {"captions": ["a horn rings out as a machine runs by", "a train horn blows as it passes by"], "sample_ids": ["slZLHwNbbt4", "zVacuqSb4LI"], "start_seconds": ["300", "30"], "properties": ["a, horn, run", "horn, blows, train"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a toilet flushes and water drains"], "sample_ids": ["sfAvvZwdLCY", "sfAvvZwdLCY"], "start_seconds": ["20", "20"], "properties": ["flushes, drains, water", "water drains, flushes, water"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a toilet is flushed", "a toilet is flushed"], "question": "which entity has more water", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a man speaks as a motor runs in the background"], "sample_ids": ["vddP56-ogds", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a man speaks as a motor runs in the background"], "sample_ids": ["t25U-v4k4ts", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["bees buzz, birds chirp, man speaks", "background, motor, run"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a man speaks while a motorcycle revs and accelerates "], "question": 
"which entity is more quiet", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "an insect buzzes around continuously"], "sample_ids": ["vSeGhaZt-aI", "v25l1jef3JY"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, run", "buzzes, continuously, insect"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "vehicles pass by on a roadway"], "sample_ids": ["yJ0TePmaOo", "tgbONvsP47Y"], "start_seconds": ["390", "0"], "properties": ["two hard objects, man, speak", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a infant makes noise and is excited"], "sample_ids": ["ylpYOorfH4o", "wIJK3-5y0kA"], "start_seconds": ["410", "30"], "properties": ["motor, run, steady", "noise, excited, infant"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to 
replace the fuel pump on a 1999 dodge ram 15", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "some men converse over an engine running"], "sample_ids": ["vddP56-ogds", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["water, splash, person, laugh", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a person speaking?", "label": 0}, {"captions": ["an adult speaks and is typing on a computer keyboard", "several insects fly while two men talk"], "sample_ids": ["x9JovgqUcs", "s-T9OVOiMLo"], "start_seconds": ["500", "330"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of a person speaking and typing on a computer keyboard?", "label": 0}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["wP8ZKrlx3oA", "su6FAOcOA8c"], "start_seconds": ["40", "4"], "properties": ["rain, storm, thunder", "engine, idle, woman"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking and a 
subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "wind blows and a vehicle blows a hard then a train blows a horn"], "sample_ids": ["wPz6QRAkEb4", "wnpJndXuxLc"], "start_seconds": ["60", "50"], "properties": ["chirps, tweets, song", "blows, vehicle, train"], "captions_pred_video": ["a bird in a cage on top of a pole", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["birds are chirping in the background ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is not a bird?", "label": 1}, {"captions": ["frogs croak and vocalize", "a car speeding up in the distance"], "sample_ids": ["yswmmRZFItk", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["croak, vocalize, frog", "distance, car, speed"], "captions_pred_video": ["a close up of a frog in the water", null], "captions_pred_audio": ["a frog is croaking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks as a machine runs", "an airplane engine spools and people speak"], "sample_ids": ["vD6lYD1l0BY", "wTjoRj1se3U"], "start_seconds": ["330", "390"], "properties": ["a, machine, run", "airplane, engine, spool"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a jet engine is running and people are talking"], "question": "which entity is a machine", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a person is snoring while sleeping"], "sample_ids": ["v7jJS8aAyA", "vJrjSeP17yE"], "start_seconds": ["10", "40"], "properties": ["wind, blows, loudly", "a person is sleeping, snoring, person"], "captions_pred_video": [null, "a black background with a small 
plane flying in the sky"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a person snoring loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["sirens ring and approach with humming of distant traffic", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xERFUeZONz8", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "gun, shoot, water"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["an emergency vehicle siren blares", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["scraping and female speech with distant music", "a toilet flushes and a female speaks"], "sample_ids": ["yHeVV-xeOxQ", "yaln9y8I7ms"], "start_seconds": ["130", "230"], "properties": ["female, speech, music", "female, flushes, toilet"], "captions_pred_video": ["of a girl milking a goat's udder", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a man speaks as a car is passing by"], "sample_ids": ["soTOh3zYJfY", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["vehicle, skid, tires", "a, car, pass"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 
6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which vehicle is skidding and squealing tires", "label": 0}, {"captions": ["an electronic device bleeps once", "small dogs yip and bark sharply"], "sample_ids": ["tHJ6JSa8Y4", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["bleeps, electronic, device", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a clock is ticking and beeping", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uEU-Hg5MTN8", "su6FAOcOA8c"], "start_seconds": ["27", "4"], "properties": ["a woman, laughs, animal", "engine, idle, woman"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a subway train is moving "], "question": "which woman is speaking", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "an airplane engine runs"], "sample_ids": ["w5W5Kqtc8E", "yVPZ2MNWpms"], "start_seconds": ["100", "0"], "properties": ["wind, blow, vehicle", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "water splashes as an animal walks through"], 
"sample_ids": ["vW4x7S1VfQc", "w1ir-sZ3Im8"], "start_seconds": ["150", "90"], "properties": ["clacking, oil, woman", "animal, water, splashes"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["food sizzles in a frying pan", "water splashes and gurgles as people speak"], "question": "which entity is more likely to be a video of a person cooking?", "label": 0}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["wyllXV6PjKo", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["a baby, a woman, a man", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman speaks and a baby cries", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tQWGZLItBXk", "zj2R0XoFr5k"], "start_seconds": ["170", "50"], "properties": ["music, kid, speak", "airplane, boy, fly"], "captions_pred_video": ["worms revolution screenshots", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "vehicles pass by on a roadway"], "sample_ids": ["vBslzh7saPw", "tgbONvsP47Y"], "start_seconds": ["90", "0"], "properties": ["power, scream, increase", "pass, vehicle, roadway"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a fire truck entering a garage"], 
"captions_pred_audio": ["a jet engine roars and accelerates ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine runs continuously", "small dogs yip and bark sharply"], "sample_ids": ["wdXV3Pv0jiY", "v-wcQf4BDY0"], "start_seconds": ["11", "120"], "properties": ["machine, running, continuously", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry and shaky", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vGj1XLJvNrw", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["wails, wails, pass", "gun, shoot, water"], "captions_pred_video": ["footage of a police car driving down a city street", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not followed by water sloshing?", "label": 0}, {"captions": ["a motor idles, accelerates, then slows down.", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vYkA3cfXp5Q", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["speed, idle, accelerate", "female, spraying, scream"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tDVADusiIoc", "zl9Dqx-j7q4"], 
"start_seconds": ["60", "6"], "properties": ["water, radio, man", "engine, laugh, loud"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a jet engine roars "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an animal quacks rapidly", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vh30P49Po6s", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["animal, quacks, rapidly", "a woman, a television program, a bird"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a duck is quacking loudly", "a woman is speaking and a dog is whimpering"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "a car speeding up in the distance"], "sample_ids": ["sSMl2vc3ek", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["a person, laughs, snores", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["x9JovgqUcs", "tdWhHV3X25Q"], "start_seconds": ["500", "60"], "properties": ["a, man, speaks, keyboard", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck 
sound", "people cheer as a vehicle engine revs"], "sample_ids": ["vfYTJq7nU", "xjhAnI2q6hM"], "start_seconds": ["130", "6"], "properties": ["ducks, quack, man", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an engine runs loudly", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["vqZuVbG6-HI", "uEU-Hg5MTN8"], "start_seconds": ["130", "27"], "properties": ["loud, engine, run", "animal, grunts, snorts"], "captions_pred_video": ["footage is blurry because it's raining outside", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is quieter", "label": 1}, {"captions": ["a motorcycle engine works nearby", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tOSWIURC-4", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["engine, work, nearby", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zdYdyF9-m8U", "zj2R0XoFr5k"], "start_seconds": ["7", "50"], "properties": ["wind, crash, shoreline", "airplane, boy, fly"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["waves crash and wind blows ", "a woman speaks while a helicopter 
flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a vehicle engine accelerating then running on idle"], "sample_ids": ["uWPRNLnpy7Y", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "engine, accelerate, idle"], "captions_pred_video": ["is taken from a car driving down the street", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a car accelerates and revs its engine ", "an engine is idling"], "question": "which vehicle is accelerating quickly", "label": 0}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a vehicle engine revs and tires squeal"], "sample_ids": ["wnpJndXuxLc", "yDoT73BWsdA"], "start_seconds": ["50", "10"], "properties": ["blows, vehicle, train", "engine revs, tires squeal, vehicle"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xSKJGCItUWE", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["engine, work, child", "stream, water, flow"], "captions_pred_video": ["footage of the helicopter flying in the room", "footage is blurry and out of focus"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "an audience gives applause as a man yells and a group sings"], "sample_ids": 
["xSKJGCItUWE", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["engine, work, child", "applause, audience, yells"], "captions_pred_video": ["footage of the helicopter flying in the room", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yRx9txMcBl0", "vbZ-0lGPneg"], "start_seconds": ["40", "30"], "properties": ["accelerates, tires, squeals", "a woman, a television program, a bird"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a stream of water runs briefly"], "sample_ids": ["y8WEcpOlT3I", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["harsh, wind, blows", "stream, water, run"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking with wind 
noise in the background ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["ylpYOorfH4o", "tiDFTC-5vU"], "start_seconds": ["410", "30"], "properties": ["engine, running, wind", "male, duck, laugh"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking over?", "label": 0}, {"captions": ["an infant crying as a woman laughs", "some men converse over an engine running"], "sample_ids": ["xhmRY9yhC7c", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["a, laugh, infant", "men, converse, engine"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a woman and an infant?", "label": 0}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "a horse runs while two women talk"], "sample_ids": ["ylpYOorfH4o", "sdvI1mHAsc"], "start_seconds": ["410", "20"], "properties": ["motor, run, steady", "two women, horse, run"], "captions_pred_video": ["for 
youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "horses clip-clop and a woman speaks"], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a woman speaks as she rubs two objects together"], "sample_ids": ["w0xsN8X18Y", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["rain, thunder, surface", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": 
["a man is speaking while a motorboat is moving in the background ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["a person sniffles and then sneezes in the distance", "paper is crumpling consistently"], "sample_ids": ["uRlbY6aoBU", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["a, distance, sneeze", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is sneezing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["ylpYOorfH4o", "uEU-Hg5MTN8"], "start_seconds": ["410", "27"], "properties": ["engine, run, loud", "a woman, laughs, animal"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a woman is speaking and a baby is crying"], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "a large crowd cheers and applauds"], "sample_ids": ["uC9dtII1KDI", 
"rqfQRErjfk8"], "start_seconds": ["150", "170"], "properties": ["wind, gusts, distance", "crowd, cheers, applauds"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "a man hugging another man in front of an orchestra"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a crowd of people clapping and cheering"], "question": "which entity is more likely to be a natural phenomenon", "label": 0}, {"captions": ["a helicopter engine idles continuously", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["ugHJF0hfYkg", "xfaoyyzw2WU"], "start_seconds": ["10", "180"], "properties": ["engine, idle, continuously", "loud, jet engine, roar"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a helicopter is flying overhead ", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "pigeons vocalize and birds chirp"], "sample_ids": ["sncRqQ67iJU", "uiS58TNyUiw"], "start_seconds": ["460", "430"], "properties": ["loud, repeatedly, man", "vocalize, bird, chirp"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "of the pigeon in the cage"], "captions_pred_audio": ["a person is snoring", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["someone whistles a tune", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sIXTftIuUgw", "wDVMhEdTiVw"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person whistling a song", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be heard", 
"label": 0}, {"captions": ["multiple motorcycles pass by as a man speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zcDwZ6W7E3E", "w5W5Kqtc8E"], "start_seconds": ["180", "100"], "properties": ["man, speak, motorcycles", "wind, blow, vehicle"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "dishes cling together then a man begins to speak"], "sample_ids": ["tw76HGONaKg", "sQGXqGcwOTc"], "start_seconds": ["570", "3"], "properties": ["A, game, keyboard", "cling, speak, dishes"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a person 
snores hilariously while someone laughs", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sSMl2vc3ek", "wqZ135Ssz0"], "start_seconds": ["20", "60"], "properties": ["a person, laughs, snores", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a person snoring loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "a man speaks as a motor runs in the background"], "sample_ids": ["vr8ZXjEBhMQ", "xZepNM9qcRA"], "start_seconds": ["150", "30"], "properties": ["sound, distance, engine", "background, motor, run"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["sjlVMgdGSK0", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["car, revving, loudly", "animal, grunts, chirps"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a car accelerates and revs its engine ", "birds are chirping and a rooster is crowing "], "question": "which entity is quieter", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zl9Dqx-j7q4", "tiDFTC-5vU"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "male, duck, laugh"], 
"captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a man is speaking and ducks are quacking"], "question": "which entity is a person?", "label": 0}, {"captions": ["a vehicle engine runs as a siren and horn sound", "a man speaks followed by another man speaking outside"], "sample_ids": ["u--KhUW8l1Y", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["sound, vehicle, horn", "two men, speak, follow"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "a mechanical buzzing getting louder"], "sample_ids": ["wwyfGO2J4", "sEprKHm8Sj8"], "start_seconds": ["90", "90"], "properties": ["people, applaud, hoot", "noise, loud, buzzing"], "captions_pred_video": [null, "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is quieter", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "a man speaks as a car is passing by"], "sample_ids": ["sQwlkXjQabo", "sK4u5T8hW78"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "a, car, pass"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "a duck quacks continuously"], "sample_ids": ["ujMt0-D-x2k", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["snoring, rhythmical, nearby", "quacks, continuously, duck"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person is snoring loudly", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["t25U-v4k4ts", "wSVhSdj0F0"], "start_seconds": ["40", "10"], "properties": ["bees buzz, birds chirp, man speaks", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a car horn honks and keys jangle with background noise "], "question": "which entity is more likely to be heard in a car", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a car speeding up in the distance"], "sample_ids": ["smGI3C1NZc", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["water, drain, toilet", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", 
"label": 1}, {"captions": ["an adult woman and an adult man speak", "a propeller rotates loudly and intensely"], "sample_ids": ["zTLVJCo4WEE", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["two people, adult, speak", "loud, intense, propeller"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "small dogs yip and bark sharply"], "sample_ids": ["wqADXCzngMw", "v-wcQf4BDY0"], "start_seconds": ["340", "120"], "properties": ["audio, humming, revving", "bark, yip, sharply"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "frogs croak and vocalize"], "sample_ids": ["wwyfGO2J4", "yswmmRZFItk"], "start_seconds": ["90", "0"], "properties": ["people, applaud, hoot", "croak, vocalize, frog"], "captions_pred_video": [null, "a close up of a frog in the water"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a child yells and another yells", "a woman speaks happily and an animal chirps"], "sample_ids": ["vMDHu7Lxcgw", "uWAAAL4CIoc"], "start_seconds": ["410", "0"], "properties": ["two, yell, child", "a woman, chirps, animal"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", null], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a woman is speaking and a dog is barking "], "question": "which entity 
has a more calming effect", "label": 1}, {"captions": ["a child speaks in closed space", "a woman speaks as she rubs two objects together"], "sample_ids": ["yW6FWLSLkx4", "vzxHnu-SFEw"], "start_seconds": ["40", "80"], "properties": ["child, space, speak", "two objects, woman, speak"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is speaking", "label": 1}, {"captions": ["some tunes played by whistling", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["u6BnG6YZqJ4", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["tune, play, whistling", "animal, grunts, chirps"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a person whistling a song", "birds are chirping and a rooster is crowing "], 
"question": "which entity is not a musical instrument", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uOpoD0gGXcs", "vfYTJq7nU"], "start_seconds": ["120", "130"], "properties": ["chirps, woman, bird", "rustling, ducks, quack"], "captions_pred_video": ["a herd of cows grazing in the field", null], "captions_pred_audio": ["birds are chirping and a man is speaking", "a duck quacks and a woman speaks"], "question": "which entity is about birds?", "label": 0}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a car speeding up in the distance"], "sample_ids": ["weDbePuc-Xc", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["music, slaps, human", "distance, car, speed"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "here comes the train and it starts to blow the horn and get close"], "sample_ids": ["w0xsN8X18Y", "s7knHCFW82w"], "start_seconds": ["30", "30"], "properties": ["rain, thunder, surface", "blow horn, get close, train"], "captions_pred_video": [null, "footage of the train on the tracks near a building and a car parked on the side of the road"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a train is blowing its horn and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["rain falls heavily on a surface and a storm builds in the background with loud thunder", "water flows as men speak 
and yell"], "sample_ids": ["wP8ZKrlx3oA", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["rain, storm, thunder", "water, flow, men"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking and yelling?", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["ul60S8TXDA8", "tdWhHV3X25Q"], "start_seconds": ["60", "60"], "properties": ["sound, distance, bell", "applause, audience, yells"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vSeGhaZt-aI", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["water, bubbles, speak", "a woman, something, fried"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["tEE3MpBt1sg", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["drill, something, 
laugh", "engine, laugh, loud"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a man driving a car in the dark"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a jet engine roars "], "question": "which entity is followed by laughter", "label": 1}, {"captions": ["an adult woman and an adult man speak", "a child speaks in closed space"], "sample_ids": ["zTLVJCo4WEE", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["two people, adult, speak", "child, space, speak"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child speaking?", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yZmhM1HcsyE", "vfYTJq7nU"], "start_seconds": ["4", "130"], "properties": ["engine, roar, water", "rustling, ducks, quack"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", null], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a duck quacks and a woman speaks"], "question": "which entity is more likely to be in a lake", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "people applaud and hoot and chat quietly"], "sample_ids": ["ujMt0-D-x2k", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["snoring, rhythmical, nearby", "people, applaud, hoot"], "captions_pred_video": ["of the dog playing with a toy on the floor", null], "captions_pred_audio": ["a person is snoring loudly", "people are clapping and speaking with background 
noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["sHbXC6na9hg", "tDlysoZiA1I"], "start_seconds": ["0", "0"], "properties": ["a person, saw, wood", "animal, grunts, chirps"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["an engine is idling and vibrating", "birds are chirping and a rooster is crowing "], "question": "which entity is not a person?", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a duck quacks continuously"], "sample_ids": ["x5cuQjOdM3E", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["cat, talk, meow", "quacks, continuously, duck"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a cat meows and a woman speaks", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["multiple ducks quack continuously", "a woman speaks as she rubs two objects together"], "sample_ids": ["wfHeoPDLMaM", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["multiple, quack, continuously", "two objects, woman, speak"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn 
ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["ducks are quacking", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "birds chirp and objects are moved around"], "sample_ids": ["wz7N8YRy74I", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["rooster, crow, background, men", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "insects buzz and a man speaks"], "question": "which entity is more active", 
"label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "an engine revs and a turning noise is made"], "sample_ids": ["wPz6QRAkEb4", "tOSWIURC-4"], "start_seconds": ["60", "0"], "properties": ["chirps, tweets, song", "noise, engine, revs"], "captions_pred_video": ["a bird in a cage on top of a pole", null], "captions_pred_audio": ["birds are chirping in the background ", "a lawn mower is running "], "question": "which entity is not a bird?", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["sWZzXuWYY", "tDlysoZiA1I"], "start_seconds": ["420", "0"], "properties": ["male, clanks, thumps", "animal, grunts, chirps"], "captions_pred_video": [null, "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "birds are chirping and a rooster is crowing "], "question": "which entity is more animal", "label": 1}, {"captions": ["a goat bleats as a person speaks", "a frog croaks as other frogs croak in the background"], "sample_ids": ["tPJvjq9QePY", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["bleats, person, speak", "background, frog, croak"], "captions_pred_video": ["a dog and a sheep in a barn", "a close up of a frog in the water"], "captions_pred_audio": ["a baby cries and a man speaks", "a frog is croaking"], "question": "which animal is more likely to be a frog", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["w-4gHptFNuU", "ukg5L09Wpvo"], "start_seconds": ["21", "150"], "properties": ["engine revs, accelerates, bump", "clickety-clack, train, whistle"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a 
car accelerates and revs its engine ", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a train engine runs and a horn blows"], "sample_ids": ["vZAw4apG0Es", "zPX9o1uDiI"], "start_seconds": ["30", "40"], "properties": ["background, tick, repeat", "engine, horn, run"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a train moves with its horn blowing and wheels squealing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "paper is crumpling consistently"], "sample_ids": ["zl9Dqx-j7q4", "v5cSxLaHADY"], "start_seconds": ["6", "0"], "properties": ["engine, laugh, loud", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a jet engine roars ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "birds chirp and objects are moved around"], "sample_ids": ["vBHyYJ8pL0", "yPUYU6t3rwo"], "start_seconds": ["2", "370"], "properties": ["noise, door, opening", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "someone whistles a tune"], "sample_ids": ["tPJvjq9QePY", "sIXTftIuUgw"], "start_seconds": ["40", "90"], "properties": ["animal, bleat, moo", "someone, tune, 
whistle"], "captions_pred_video": ["a dog and a sheep in a barn", null], "captions_pred_audio": ["a baby cries and a man speaks", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a woman speaks happily and an animal chirps"], "sample_ids": ["xOZfdgAgJ9o", "uWAAAL4CIoc"], "start_seconds": ["40", "0"], "properties": ["woman, whimpering, speaking", "a woman, chirps, animal"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a dog is barking "], "question": "which entity is more cheerful", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "a person is snoring while sleeping"], "sample_ids": ["t8CV69hcvF0", "vJrjSeP17yE"], "start_seconds": ["210", "40"], "properties": ["person, sneeze, follow", "a person is sleeping, snoring, person"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a woman sneezes and speaks", "a person snoring loudly"], "question": "which person is snoring", "label": 1}, {"captions": ["continuous snoring", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["sLkeqCDJIyw", "vlS6YMeWAPo"], "start_seconds": ["120", "40"], "properties": ["loud, snoring, noise", "sheep, baa, birds"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a person is snoring loudly", "a goat bleats and birds chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["a car speeding up in the distance", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u0TrcHhkPQ", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["distance, car, speed", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman speaks while a helicopter flies overhead "], "question": "which object is flying", "label": 0}, {"captions": ["someone is snoring while sleeping", "a car speeding up in the distance"], "sample_ids": ["ujMt0-D-x2k", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["snore, sleep, someone", "distance, car, speed"], "captions_pred_video": ["of the dog playing with a toy on the floor", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "wind blows as people chatter quietly"], "sample_ids": ["u21-Z5gJCB8", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["background, voice, man", "wind, chatter, people"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "an airplane engine spools and people speak"], "sample_ids": ["vzxHnu-SFEw", "wTjoRj1se3U"], "start_seconds": ["80", 
"390"], "properties": ["two objects, woman, speak", "airplane, engine, spool"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a jet engine is running and people are talking"], "question": "which object is moving", "label": 1}, {"captions": ["wind blowing followed by a zoom", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vr8ZXjEBhMQ", "vfYTJq7nU"], "start_seconds": ["150", "130"], "properties": ["wind, blow, zoom", "rustling, ducks, quack"], "captions_pred_video": ["is taken from a motorcycle's point of view", null], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a clock ticks quietly and rhythmically"], "sample_ids": ["tIY7qOV3rEM", "u7C-AEBQM"], "start_seconds": 
["0", "30"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "ticks, rhythmic, quiet"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["wind blows as people chatter quietly", "an infant crying frantically"], "sample_ids": ["xBxDz0CFVn0", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["wind, chatter, people", "cry, infant, frantically"], "captions_pred_video": ["footage is blurry and out of focus", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a baby cries loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w34HjHr6gAY", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["beeps, hit, woman", "airplane, boy, fly"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a person sniffs and sneezes", "motors rev 
and run loudly as a person laughs"], "sample_ids": ["uRlbY6aoBU", "zl9Dqx-j7q4"], "start_seconds": ["0", "6"], "properties": ["sneezes, person, sniffs", "motors rev, laugh, loudly"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is sneezing ", "a jet engine roars "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["t25U-v4k4ts", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["a, chirps, bird", "airplane, boy, fly"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about flying?", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a toilet flushes and a female speaks"], "sample_ids": ["wEBlkGWVWwE", "yaln9y8I7ms"], "start_seconds": ["260", "230"], "properties": ["a, babble, woman", "female, flushes, toilet"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uWAAAL4CIoc", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["a, dog, vocalize", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a woman is speaking and a 
baby is crying"], "question": "which entity has a dog vocalize?", "label": 0}, {"captions": ["wind blowing and birds chirping with the distant cooing of a large bird", "frogs croak and vocalize"], "sample_ids": ["wRBHTgrbiwg", "yswmmRZFItk"], "start_seconds": ["50", "0"], "properties": ["birds, chirp, cooing", "croak, vocalize, frog"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "a close up of a frog in the water"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a frog is croaking"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["a person is whistling a tune", "a man speaks as a car is passing by"], "sample_ids": ["scYRUkrFLiQ", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, tune, whistle", "a, car, pass"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 0}, {"captions": ["a baby laughs giddily and a woman laughs then speaks", "a person sneezes followed by another person speaking"], "sample_ids": ["wjsXBsc7M40", "t8CV69hcvF0"], "start_seconds": ["10", "210"], "properties": ["a baby laughs, a woman laughs, a woman speaks", "person, sneeze, follow"], "captions_pred_video": ["footage of the baby playing with a toothbrush", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman sneezes 
and speaks"], "question": "which entity has a person speaking after a person sneezes?", "label": 1}, {"captions": ["a person is whistling", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sIXTftIuUgw", "yajyRTUQk3U"], "start_seconds": ["90", "400"], "properties": ["person, whistling, person", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["wAAkbZToh8", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man burps and a woman speaks", "a man is speaking and ducks are quacking"], "question": "which entity is a person?", "label": 0}, {"captions": ["a steam engine runs and whistles as it passes by", "a propeller rotates loudly and intensely"], "sample_ids": ["se87d6yxEOA", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["run, whistle, pass", "loud, intense, propeller"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a person snoring several times", "a woman speaks with water running"], "sample_ids": ["spJCm8tD9Zo", "wTideSjRFS0"], "start_seconds": ["90", "30"], "properties": ["snore, person, several", "water, running, woman"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a woman cooking in a kitchen with a microwave oven"], 
"captions_pred_audio": ["a person is snoring loudly", "a woman is speaking while water is running in the background"], "question": "which entity is a person", "label": 0}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["w8uLijTqtlU", "su6FAOcOA8c"], "start_seconds": ["70", "4"], "properties": ["wind, microphone, noise", "engine, idle, woman"], "captions_pred_video": ["footage is blurry and shaky", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking and a subway train is moving "], "question": "which entity is a recording", "label": 1}, {"captions": ["a person is burping while a girl speaks", "a man speaks as a car is passing by"], "sample_ids": ["vdoxuJn9lTc", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["person, burp, girl", "a, car, pass"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["someone whistles a tune", "a infant makes noise and is excited"], "sample_ids": ["sIXTftIuUgw", "wIJK3-5y0kA"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], 
"captions_pred_audio": ["a person whistling a song", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "an engine starts and increases in power"], "sample_ids": ["u7C-AEBQM", "zjTG0gaGCUI"], "start_seconds": ["30", "80"], "properties": ["ticks, rhythmic, quiet", "power, increase, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a jet engine roars as wind blows "], "question": "which is more quiet", "label": 0}, {"captions": ["someone is burping continuously", "a motor idles, accelerates, then slows down."], "sample_ids": ["y636gklDioE", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "speed, idle, accelerate"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a person burps loudly several times", "an engine is idling"], "question": "which entity is not a burp", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sQwlkXjQabo", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "stream, water, flow"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage is blurry and out of focus"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a car speeding up in the distance", "a vehicle engine accelerating then running on idle"], "sample_ids": ["u0TrcHhkPQ", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["distance, car, speed", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a 
race car accelerates and revs its engine ", "an engine is idling"], "question": "which is not a car", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "people speak as gunfire rings out"], "sample_ids": ["uiS58TNyUiw", "wqTCwqVRDlk"], "start_seconds": ["430", "80"], "properties": ["vocalize, bird, chirp", "gunfire, ring, speak"], "captions_pred_video": ["of the pigeon in the cage", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "pigeons vocalize and birds chirp"], "sample_ids": ["zF8yoL0rkbI", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["engine, run, someone", "vocalize, bird, chirp"], "captions_pred_video": ["footage of the traffic on the street at night", "of the pigeon in the cage"], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "an engine sputters followed by a car zooming by"], "sample_ids": ["wSVhSdj0F0", "u5RmF3c3Aw"], "start_seconds": ["10", "60"], "properties": ["beep, clang, footsteps", "engine, car, zoom"], "captions_pred_video": [null, null], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a race car accelerates and skids with wind noise in the background "], "question": "which entity is a car?", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "waves crash against a shoreline and wind blows"], "sample_ids": ["xBxDz0CFVn0", "zdYdyF9-m8U"], "start_seconds": ["30", "7"], "properties": ["stream, water, flow", "wind, crash, shoreline"], "captions_pred_video": ["footage is blurry and out of focus", "a 
person kayaking in the ocean near a cliff"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "waves crash and wind blows "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a toilet flushes and a female speaks"], "sample_ids": ["vZAw4apG0Es", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["background, tick, repeat", "female, flushes, toilet"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage is blurry and out of focus"], "captions_pred_audio": ["a clock is ticking and people are talking", "a toilet flushes and a man speaks"], "question": "which entity has a female speaking?", "label": 1}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "water splashes and a door squeaks"], "sample_ids": ["uZesmtKZGSw", "sdXV-ylviw"], "start_seconds": ["250", "190"], "properties": ["men, talk, cars", "sound, splash, door"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", null], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "paper is crumpling consistently"], "sample_ids": ["sOa7g-44Dag", "v5cSxLaHADY"], "start_seconds": ["30", "0"], 
"properties": ["background, man, spray", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a whistling owl calls out repeatedly and insects screech", "a man speaks as a car is passing by"], "sample_ids": ["w6RTHR6AeAg", "sK4u5T8hW78"], "start_seconds": ["40", "30"], "properties": ["call, owl, screech", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an owl hoots and mechanisms operate ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["an infant crying frantically", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zwOBqeFTgiU", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["cry, infant, frantically", "applause, audience, yells"], "captions_pred_video": ["of the baby crying in the car seat", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby cries loudly", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "water flows as men speak and yell"], "sample_ids": 
["zO-LSSY92ZM", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["liquid, surface, sound", "water, flow, men"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["steam is hissing and hissing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of water flowing?", "label": 1}, {"captions": ["people speak in a closed space", "vehicles pass by on a roadway"], "sample_ids": ["sTpirNYo8vQ", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["people, space, speak", "pass, vehicle, roadway"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["bees buzz and wind blows", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tMJne1a4AFI", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["bees 
buzz, wind blows, bees", "multiple, people, yell"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a door slams shut roughly", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["zkKdxzNC97Y", "uZesmtKZGSw"], "start_seconds": ["27", "250"], "properties": ["a door, slams, shut", "men, talk, cars"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["heavy rain splashes as it falls", "people cheer as a vehicle engine revs"], "sample_ids": ["wP8ZKrlx3oA", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["fall, rain, splash", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a man speaks as a car is 
passing by", "people speak as gunfire rings out"], "sample_ids": ["sK4u5T8hW78", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["a, car, pass", "gunfire, ring, speak"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "water splashes as an animal walks through"], "sample_ids": ["rwTERCUno", "w1ir-sZ3Im8"], "start_seconds": ["90", "90"], "properties": ["engine, idle, sputter", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["an engine is idling and vibrating", "water splashes and gurgles as people speak"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "water splashes as an animal walks through"], "sample_ids": ["uiS58TNyUiw", "w1ir-sZ3Im8"], "start_seconds": ["430", "90"], "properties": ["audio, man, speaking", "animal, water, splashes"], "captions_pred_video": ["of the pigeon in the cage", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks over a 
running engine and blowing wind", "a man sprays as a scraping occurs in the background"], "sample_ids": ["ylpYOorfH4o", "sOa7g-44Dag"], "start_seconds": ["410", "30"], "properties": ["engine, running, wind", "background, man, spray"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and rubbing his hands together "], "question": "which entity has a man speaking over a running engine and blowing wind?", "label": 0}, {"captions": ["a machine beeps continuously", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["y682ml90jGw", "uYT5gxnyMWM"], "start_seconds": ["11", "50"], "properties": ["beeps, machine, continuously", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uEU-Hg5MTN8", "tdWhHV3X25Q"], "start_seconds": ["27", "60"], "properties": ["animal, grunts, snorts", "applause, audience, yells"], 
"captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a horn rings out as a machine runs by"], "sample_ids": ["zj2R0XoFr5k", "slZLHwNbbt4"], "start_seconds": ["50", "300"], "properties": ["airplane, boy, fly", "a, horn, run"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["birds chirp as a bell rings", "an electric engine works nearby followed by a child talking"], "sample_ids": ["ziUT9IFTkjg", "xSKJGCItUWE"], "start_seconds": ["10", "10"], "properties": ["chirp, bell, ring", "engine, work, child"], "captions_pred_video": [null, "footage of the helicopter flying in the room"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a high pitched engine is running and a child speaks"], "question": "which entity is a machine", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a man speaks as bees buzz and birds chirp"], "sample_ids": ["sa6TLVbooCc", "t25U-v4k4ts"], "start_seconds": ["240", "40"], "properties": ["people, laugh, child", "bees buzz, birds chirp, man speaks"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking and bees are buzzing"], 
"question": "which entity has a child speaking?", "label": 0}, {"captions": ["a motor idles, accelerates, then slows down.", "wind blows and people scream while an engine revs"], "sample_ids": ["vYkA3cfXp5Q", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["speed, idle, accelerate", "wind, engine, scream"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", null], "captions_pred_audio": ["an engine is idling", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a video of a car?", "label": 0}, {"captions": ["frogs croak and vocalize", "an infant crying as a woman laughs"], "sample_ids": ["yswmmRZFItk", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["croak, vocalize, frog", "a, laugh, infant"], "captions_pred_video": ["a close up of a frog in the water", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a frog is croaking", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a frog vocalizes as birds chirp"], "sample_ids": ["x6ijhqRY38s", "wqUmIEzuNz4"], "start_seconds": ["250", "30"], "properties": ["something metal, glass, hit", "frog, bird, vocalize"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "a frog sitting in the grass on a sunny day"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a cat meows and rustles"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "vehicles pass by on a roadway"], "sample_ids": ["xZepNM9qcRA", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["background, motor, run", "pass, vehicle, roadway"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage of a fire truck entering a garage"], 
"captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "bird squawks are accompanied by a man and woman speaking"], "sample_ids": ["yajyRTUQk3U", "wqZ135Ssz0"], "start_seconds": ["400", "60"], "properties": ["noise, woman, speak", "man, woman, squawks"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a man and woman speaking?", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a woman speaks as she rubs two objects together"], "sample_ids": ["rwTERCUno", "vzxHnu-SFEw"], "start_seconds": ["90", "80"], "properties": ["engine, idle, sputter", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["an engine is idling and vibrating", "a woman is speaking and 
breathing with mechanisms in the background "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["a man speaks as a machine runs", "paper is crumpling consistently"], "sample_ids": ["vD6lYD1l0BY", "v5cSxLaHADY"], "start_seconds": ["330", "0"], "properties": ["a, machine, run", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "three men talk while wind blows and some liquid flows"], "sample_ids": ["x6ijhqRY38s", "vJ7JPEFhyLA"], "start_seconds": ["250", "16"], "properties": ["something metal, glass, hit", "three men, wind, flow"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man talking?", "label": 0}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "a stream of water flows as people talk and wind blows"], "sample_ids": ["xNMovAf3o50", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["rain, thunder, music", "stream, water, flow"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "footage is blurry and out of focus"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is flowing", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a man speaks as insects 
buzz and a bird chirps"], "sample_ids": ["uZesmtKZGSw", "t25U-v4k4ts"], "start_seconds": ["250", "40"], "properties": ["car, track, man", "a, chirps, bird"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking and bees are buzzing"], "question": "which entity has a bird chirp?", "label": 1}, {"captions": ["a person speaks over rustling leaves", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zOZleIRqZm4", "wqZ135Ssz0"], "start_seconds": ["80", "60"], "properties": ["rustling, leaves, person", "two men, woman, birds"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sQwlkXjQabo", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "engine, accelerate, idle"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["spraying 
followed by silence", "an engine is idling"], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["an aircraft engine runs", "small dogs yip and bark sharply"], "sample_ids": ["yLCORCnd35Q", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["engine, aircraft, runs", "bark, yip, sharply"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "multiple people speak and children yell while water gurgles"], "sample_ids": ["ylpYOorfH4o", "vb1fPSDI4c"], "start_seconds": ["410", "30"], "properties": ["engine, running, wind", "multiple, people, yell"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "water splashes as an animal walks through"], "sample_ids": ["wRBHTgrbiwg", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["bird, owl, speak", 
"animal, water, splashes"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["wz7N8YRy74I", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["rooster, crow, background, people", "engine, revs, vehicle"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["someone whistles briefly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["uFoga8sHpiw", "su6FAOcOA8c"], "start_seconds": ["90", "4"], "properties": ["sound, duration, pitch", "engine, idle, woman"], "captions_pred_video": ["footage of a bird in a cage", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking and a subway train is moving "], "question": "which entity has a longer duration", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a door opens and birds chirp"], "sample_ids": ["zuua6-5goWw", "yeFvk9x0wWI"], "start_seconds": ["30", "30"], "properties": ["sound, pop, bird", "door, open, birds"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 
10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "birds chirp in the background as a car drives by "], "question": "which entity has birds chirp and a pop occurs before a man speaks?", "label": 0}, {"captions": ["a man sprays as a scraping occurs in the background", "an insect buzzes around continuously"], "sample_ids": ["sOa7g-44Dag", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["background, man, spray", "buzzes, continuously, insect"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zl9Dqx-j7q4", "vbZ-0lGPneg"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a man driving a car in the dark", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a jet engine roars ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an insect buzzes around continuously", "an engine runs loudly"], "sample_ids": ["v25l1jef3JY", "vqZuVbG6-HI"], 
"start_seconds": ["0", "130"], "properties": ["buzzes, continuously, insect", "loud, engine, run"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ukg5L09Wpvo", "uYT5gxnyMWM"], "start_seconds": ["150", "50"], "properties": ["a train, a horn, a bell", "female, spraying, scream"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and spraying?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a child speaks in closed space"], "sample_ids": ["rqfQRErjfk8", "yW6FWLSLkx4"], "start_seconds": ["170", "40"], "properties": ["crowd, cheers, applauds", "child, space, speak"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is more likely to be in a public place", "label": 0}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["y2bVZ7rz-5M", "yLy-WycbVVE"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "background, people, talk"], "captions_pred_video": ["footage of a 
parade of fire trucks driving down the street", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity has a horn honking?", "label": 0}, {"captions": ["paper is repeatedly crumpled and crinkled", "a child speaks in closed space"], "sample_ids": ["vms5XGTDVQc", "yW6FWLSLkx4"], "start_seconds": ["220", "40"], "properties": ["paper, crumpled, crinkled", "child, space, speak"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not crumpled and crinkled", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a motorcycle engine is idling"], "sample_ids": ["xBxDz0CFVn0", "vZAqdHZ81yA"], "start_seconds": ["30", "180"], "properties": ["wind, chatter, people", "engine, motorcycle, idling"], "captions_pred_video": ["footage is blurry and out of focus", "a motorcycle is parked on the side of the road with its rear end facing the viewer"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "an engine is idling loudly"], "question": "which is quieter", "label": 1}, {"captions": ["food is frying while a woman speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yhQ2Lg-7qDY", "uYT5gxnyMWM"], "start_seconds": ["130", "50"], "properties": ["food, woman, speak", "a, scream, girl"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity 
has a scream?", "label": 1}, {"captions": ["a piece of wood is being placed down and sawed", "a motor runs in the distance as a soft wind periodically gusts"], "sample_ids": ["uiItxDsDMFI", "xyL9F5VrjkE"], "start_seconds": ["30", "20"], "properties": ["wood, piece, saw", "wind, motor, distance"], "captions_pred_video": ["a man cutting a log with an axe in the woods", "of a caterpillar truck loading logs into a trailer"], "captions_pred_audio": ["a saw is being used with background noise ", "the wind is blowing and a car is passing by "], "question": "which entity is not a piece of wood?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a clock ticktocks"], "sample_ids": ["x5cuQjOdM3E", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["cat, meows, young woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a black background with an airplane flying in the sky", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a cat meows and a woman speaks", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "paper folding and crinkling"], "sample_ids": ["vddP56-ogds", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["liquid, laughs, man", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make 
a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling?", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "some men converse over an engine running"], "sample_ids": ["w9lpbUn0hPc", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["male, wind, rustling", "men, converse, engine"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", null], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation between men?", "label": 1}, {"captions": ["a clock ticktocks briefly", "a child speaks in closed space"], "sample_ids": ["u7C-AEBQM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["ticktocks, clock, ticktocks briefly", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a drill runs and two people laugh", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["tEE3MpBt1sg", "uEU-Hg5MTN8"], "start_seconds": ["50", "27"], "properties": ["two people, laugh, drill", "a woman, laughs, animal"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["people are laughing 
breathing and speaking with background noise ", "a woman is speaking and a baby is crying"], "question": "which entity has a drill running?", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "an engine revs and a turning noise is made"], "sample_ids": ["wAAkbZToh8", "tOSWIURC-4"], "start_seconds": ["0", "0"], "properties": ["burp, laugh, speak", "noise, engine, revs"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man burps and a woman speaks", "a lawn mower is running "], "question": "which entity is not a noise?", "label": 0}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y2bVZ7rz-5M", "wz7N8YRy74I"], "start_seconds": ["280", "30"], "properties": ["motor noise, horn, siren", "rooster, crow, background, men"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["uYT5gxnyMWM", "xfaoyyzw2WU"], "start_seconds": ["50", "180"], "properties": ["female, spraying, scream", "loud, jet engine, roar"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks briefly", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["u7C-AEBQM", "uZesmtKZGSw"], "start_seconds": ["30", 
"250"], "properties": ["ticktocks, clock, ticktocks briefly", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["birds tweet and squawk", "water pouring and bubbling"], "sample_ids": ["w1mlz3Pe4fU", "uyRfq-jKPpo"], "start_seconds": ["300", "50"], "properties": ["squawk, tweet, scream", "water, bubbles, pouring"], "captions_pred_video": ["of a bird in a cage", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["birds are chirping and singing", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a 
man rubs two objects together then speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vveS8HT7Uog", "uYT5gxnyMWM"], "start_seconds": ["100", "50"], "properties": ["a man, objects, speak", "female, spraying, scream"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["yaln9y8I7ms", "yDoT73BWsdA"], "start_seconds": ["230", "10"], "properties": ["female, flushes, toilet", "engine, revs, vehicle"], "captions_pred_video": ["footage is blurry and out of focus", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "paper folding and crinkling"], "sample_ids": ["sYITalLZjj4", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["stream, flow, wind", "paper, fold, crinkle"], "captions_pred_video": ["two ducks are swimming in the water near each other", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card 
out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["wind blows and birds chirp", "the wind blows and a mouse clicks "], "question": "which entity is not a stream of water", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "an airplane engine runs"], "sample_ids": ["v0x1odnXtP0", "yVPZ2MNWpms"], "start_seconds": ["210", "0"], "properties": ["keyboard, type, computer", "engine, airplane, runs"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a person is typing on a keyboard", "a car is driving by on the road "], "question": "which is a moving object", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "people speak softly as food sizzles"], "sample_ids": ["s4Uz1Ffgo04", "yhQ2Lg-7qDY"], "start_seconds": ["100", "130"], "properties": ["roars, background, people speaking", "food, sizzle, speak"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a faucet is running and a man is speaking"], "question": "which entity is quieter", "label": 1}, {"captions": ["a person snoring several times", "a car speeding up in the distance"], "sample_ids": ["spJCm8tD9Zo", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["snore, person, several", "distance, car, speed"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a race car 
accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "some men converse over an engine running"], "sample_ids": ["vlS6YMeWAPo", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["noise, bleat, call", "men, converse, engine"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a conversation?", "label": 1}, {"captions": ["a small engine idles continuously", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["y5WII6cTH7k", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["engine, idle, continuously", "female, spraying, scream"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a woman is speaking and a baby is crying"], "question": "which entity is not spraying?", "label": 0}, {"captions": ["a dog barks and whimpers", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sShpyu2l4YQ", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "a woman, a television program, a bird"], "captions_pred_video": ["the puppies are playing with a toy", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a clock ticktocks"], "sample_ids": ["uWPRNLnpy7Y", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["accelerate, laugh, vehicle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["is 
taken from a car driving down the street", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "a car accelerates and wind blows"], "sample_ids": ["vbpKkWvfOu4", "u0TrcHhkPQ"], "start_seconds": ["560", "20"], "properties": ["a, woman, man", "accelerates, wind, blows"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "water is sprayed across a hard surface"], "sample_ids": ["tGcFnX0GHI", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["ring, talk, woman", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["several insects fly while two men talk", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["s-T9OVOiMLo", "zj2R0XoFr5k"], "start_seconds": ["330", "50"], "properties": ["several, fly, men", "airplane, boy, fly"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a 
woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a woman and man are speaking", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vbpKkWvfOu4", "xfaoyyzw2WU"], "start_seconds": ["560", "180"], "properties": ["two people, speaking, woman, man", "loud, jet engine, roar"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["goats bleat and people speak", "a propeller rotates loudly and intensely"], "sample_ids": ["z5iUE5h0EPs", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["goats bleat, people speak, language", "loud, intense, propeller"], "captions_pred_video": ["of the goat in the barn", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a goat bleats and a man speaks", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "long loud burping by a man"], "sample_ids": ["zsLxS-uLJTw", "xmiUIOhtZyQ"], "start_seconds": ["20", "60"], "properties": ["horn, blast, train", "loud, burp, man"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "homer simpson drinking a beer"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a person burps and music plays in the background "], "question": "which is louder", "label": 0}, {"captions": ["water bubbles and gurgles.", "an aircraft engine runs"], "sample_ids": ["tB7hWb9gTuQ", "yLCORCnd35Q"], "start_seconds": 
["30", "0"], "properties": ["bubbles, gurgles, water", "engine, aircraft, runs"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "a lufthansa airbus a380 landing at london's heathrow airport"], "captions_pred_audio": ["water is splashing and gurgling", "a train is moving and its wheels are squealing "], "question": "which entity is a moving object", "label": 1}, {"captions": ["people speak as gunfire rings out", "water runs into a sink while men speak"], "sample_ids": ["wqTCwqVRDlk", "vzceMbklWc"], "start_seconds": ["80", "180"], "properties": ["gunfire, ring, speak", "water, sink, run"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", null], "captions_pred_audio": ["a man is speaking and a gun is fired", "water is running and a man is speaking"], "question": "which entity is more calm", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "birds twitter and chirp and clatter"], "sample_ids": ["sWZzXuWYY", "yeFvk9x0wWI"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "chirp, twitter, clatter"], "captions_pred_video": [null, "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "birds chirp in the background as a car drives by "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man talks as several small engines run", "people cheer as a vehicle engine revs"], "sample_ids": ["u9A6VZQCZpU", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["a, man, talk", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a dark 
barks and whimpers", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sYj4hpDUZDQ", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "a woman, a television program, a bird"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a dog barks and a cat meows", "a woman is speaking and a dog is whimpering"], "question": "which entity is more active", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "people cheer as a vehicle engine revs"], "sample_ids": ["tqR406bGiE", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["flush, water, gurgle", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a toilet is flushed", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a motorcycle engine is idling", "small dogs yip and bark sharply"], "sample_ids": ["vZAqdHZ81yA", "v-wcQf4BDY0"], "start_seconds": ["180", "120"], "properties": ["engine, motorcycle, idling", "bark, yip, sharply"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["an engine is idling loudly", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "an infant crying as a woman laughs"], "sample_ids": ["wtDqrBygTcU", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["man, engine, run", "a, laugh, infant"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "of a baby crying 
in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a motor is running", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a stream of water runs briefly"], "sample_ids": ["vSeGhaZt-aI", "x-PeY8Yb8M4"], "start_seconds": ["50", "300"], "properties": ["water, sink, talk", "stream, water, run"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a man speaks as a motor runs in the background"], "sample_ids": ["xM4joTqDVp4", "xZepNM9qcRA"], "start_seconds": ["160", "30"], "properties": ["background, chirp, birds", "background, motor, run"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a dog barks and whimpers", "a child speaks in closed space"], "sample_ids": ["sShpyu2l4YQ", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["barks, whimpers, dog", "child, space, speak"], "captions_pred_video": ["the puppies are playing with a toy", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["cats meow and then a person 
begins to talk while the cats continue to meow", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["x5cuQjOdM3E", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["cat, talk, meow", "a woman, laughs, animal"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a cat", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["y8WEcpOlT3I", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["wind, speak, buffeting", "wind, blow, vehicle"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blows before women yell?", "label": 1}, {"captions": ["a man is filing a hard object", "paper folding and crinkling"], "sample_ids": ["vveS8HT7Uog", "zPpG3RD8lSs"], "start_seconds": ["100", "20"], "properties": ["a man, hard, object", "paper, fold, crinkle"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper 
youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "the wind blows and a mouse clicks "], "question": "which object is being filed", "label": 0}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "a man speaks as a car is passing by"], "sample_ids": ["s59PfAghdkM", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["bird, chirp, background, horse, neigh", "a, car, pass"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a horse in it?", "label": 0}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "people cheer as a vehicle engine revs"], "sample_ids": ["tQWGZLItBXk", "xjhAnI2q6hM"], "start_seconds": ["170", "6"], "properties": ["voice, music, whoosh", "engine revs, vehicle, people"], "captions_pred_video": ["worms revolution screenshots", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a child 
speaks music plays video game sounds sound effects and sound effects play ", "a truck is revving its engine and a man is speaking "], "question": "which entity has more people", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "a woman speaks as she rubs two objects together"], "sample_ids": ["zhx6hoYrHeI", "vzxHnu-SFEw"], "start_seconds": ["160", "80"], "properties": ["engine, sputter, rough", "two objects, woman, speak"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is not a person speaking?", "label": 0}, {"captions": ["white noise and birds chirping", "wind blows as people chatter quietly"], "sample_ids": ["wRBHTgrbiwg", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["noise, white, chirping", "wind, chatter, people"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are 
chirping and insects are buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "a man speaks while turning a water faucet on"], "sample_ids": ["xvDdE3zNf8Y", "vf9xf3vMsGM"], "start_seconds": ["120", "540"], "properties": ["a, female, speaks", "A man speaks while turning a water faucet on."], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of the person washing their hands under the faucet"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking while water is running in the background"], "question": "which entity is a man", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "an airplane accelerates briefly"], "sample_ids": ["uiS58TNyUiw", "zjTG0gaGCUI"], "start_seconds": ["430", "80"], "properties": ["audio, man, speaking", "accelerates, airplane, briefly"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a jet engine roars as wind blows "], "question": "which is a moving object", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a man speaks while water trickles and flows"], "sample_ids": ["tDVADusiIoc", "sapQIQUhFc"], "start_seconds": ["60", "280"], "properties": ["wind, radio, waves", "water, trickles, flow"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and a stream is flowing in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["bees buzz and wind blows", "a door opens and closes"], "sample_ids": ["tMJne1a4AFI", "vBHyYJ8pL0"], "start_seconds": ["0", "2"], "properties": ["bees buzz, wind blows, 
bees", "open, close, door"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is not a door?", "label": 0}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vlS6YMeWAPo", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["sheep, baa, birds", "airplane, boy, fly"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a goat bleats and birds chirp", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a woman speaks with water running", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["wTideSjRFS0", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["water, running, woman", "two men, woman, birds"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["water bubbles and gurgles.", "an infant crying as a woman laughs"], "sample_ids": ["tB7hWb9gTuQ", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["bubbles, gurgles, water", "a, laugh, infant"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["water is splashing and gurgling", "a baby cries and a woman speaks"], "question": "which 
entity is a person", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "children speak and play together"], "sample_ids": ["zcDwZ6W7E3E", "yVVP8XvWJTo"], "start_seconds": ["180", "260"], "properties": ["a, man, speak", "children, speak, play"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "children are speaking and breathing with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a stream of water flows as people talk and wind blows", "a woman speaks as she rubs two objects together"], "sample_ids": ["xBxDz0CFVn0", "vzxHnu-SFEw"], "start_seconds": ["30", "80"], "properties": ["stream, water, flow", "two objects, woman, speak"], "captions_pred_video": ["footage is blurry and out of focus", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and breathing with 
mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "someone whistles a tune"], "sample_ids": ["zsLxS-uLJTw", "sIXTftIuUgw"], "start_seconds": ["20", "90"], "properties": ["horn, blast, train", "someone, tune, whistle"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", null], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["zFjIWfSD-4", "w34HjHr6gAY"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a man talks as several small engines run", "a toilet flushes and water drains"], "sample_ids": ["u9A6VZQCZpU", "sfAvvZwdLCY"], "start_seconds": ["30", "20"], "properties": ["a, man, talk", "water drains, flushes, water"], "captions_pred_video": [null, "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a 
toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["bees buzz and wind blows", "paper is crumpling consistently"], "sample_ids": ["tMJne1a4AFI", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["bees buzz, wind blows, bees", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a swarm of bees on the ground", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a swarm of bees buzzing around", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["wind blows as people chatter quietly", "an airplane accelerates briefly"], "sample_ids": ["xBxDz0CFVn0", "zjTG0gaGCUI"], "start_seconds": ["30", "80"], "properties": ["wind, chatter, people", "accelerates, airplane, briefly"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a jet engine roars as wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["a stream of water flows quickly", "birds chirp and objects are moved around"], "sample_ids": ["wbHTKEJZyhc", "yPUYU6t3rwo"], "start_seconds": ["20", "370"], "properties": ["stream, water, flow", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in 
autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["yajyRTUQk3U", "yks4cLgIDMc"], "start_seconds": ["400", "170"], "properties": ["a woman, something, fried", "background, speaking, child"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["a machine runs continuously", "frogs croak and vocalize"], "sample_ids": ["wdXV3Pv0jiY", "yswmmRZFItk"], "start_seconds": ["11", "0"], "properties": ["machine, running, continuously", "croak, vocalize, frog"], "captions_pred_video": ["footage is blurry and shaky", "a close up of a frog in the water"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a frog is croaking"], "question": "which entity is not a machine?", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yZrFNS7GFBQ", "zFjIWfSD-4"], "start_seconds": 
["30", "410"], "properties": ["pigeon, buzzes, insect", "People, motor, brakes"], "captions_pred_video": ["of the bird in the cage", null], "captions_pred_audio": ["an owl hoots in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a clock ticktocks"], "sample_ids": ["xM4joTqDVp4", "v-g-j2uTByM"], "start_seconds": ["160", "30"], "properties": ["background, chirp, birds", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "wind blowing followed by a zoom"], "sample_ids": ["slZLHwNbbt4", "vr8ZXjEBhMQ"], "start_seconds": ["300", "150"], "properties": ["clap, distance, horn", "wind, blow, zoom"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["tZGN5a7ybxo", "wz7N8YRy74I"], "start_seconds": ["60", "30"], "properties": ["ring, train, horn", "rooster, crow, background, men"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a train 
is moving and blowing its horn ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to a rooster?", "label": 1}, {"captions": ["a goat bleats as a person speaks", "water flows and trickles"], "sample_ids": ["tPJvjq9QePY", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["bleats, person, speak", "water, flow, trickle"], "captions_pred_video": ["a dog and a sheep in a barn", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a baby cries and a man speaks", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["children speak and play together", "multiple people speak and children yell while water gurgles"], "sample_ids": ["yVVP8XvWJTo", "vb1fPSDI4c"], "start_seconds": ["260", "30"], "properties": ["children, speak, play", "multiple, people, yell"], "captions_pred_video": ["footage of a playground at a school or daycare center", null], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "an engine runs loudly"], "sample_ids": ["ukxt9I7eMMg", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["continuous, woman, speaking", "loud, engine, run"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "a man speaks followed by another man speaking outside"], "sample_ids": ["wqUmIEzuNz4", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": 
["frog, bird, vocalize", "two men, speak, follow"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a cat meows and rustles", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single speaker?", "label": 0}, {"captions": ["vehicles pass by on a roadway", "water splashing and a person laughs in the distance then a man speaks nearby"], "sample_ids": ["tgbONvsP47Y", "vddP56-ogds"], "start_seconds": ["0", "30"], "properties": ["pass, vehicle, roadway", "water, splash, person, laugh"], "captions_pred_video": ["footage of a fire truck entering a garage", null], "captions_pred_audio": ["a car is driving on the road ", "water is running and gurgling and a man is speaking"], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["weDbePuc-Xc", "y8WEcpOlT3I"], "start_seconds": ["40", "40"], "properties": ["music, slaps, human", "harsh, wind, blows"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man is speaking with wind noise in the background "], "question": "which entity has a harsh wind blowing?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "wind blowing followed by a zoom"], "sample_ids": ["uJV8NDaHqqk", "vr8ZXjEBhMQ"], "start_seconds": ["100", "150"], "properties": ["loud, fly, chirp", "wind, blow, zoom"], "captions_pred_video": ["a bee hive in a wooden box", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a swarm of bees buzzing around", "wind blows and a chainsaw cuts through wood "], "question": 
"which entity is not loud", "label": 1}, {"captions": ["children speak and play together", "winds blows roughly as a vehicle races past"], "sample_ids": ["yVVP8XvWJTo", "xjvTpk2Zpr8"], "start_seconds": ["260", "70"], "properties": ["children, speak, play", "wind, blows, vehicle"], "captions_pred_video": ["footage of a playground at a school or daycare center", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": ["ugHJF0hfYkg", "s4Uz1Ffgo04"], "start_seconds": ["10", "100"], "properties": ["engine, running, continuously", "roars, background, people speaking"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["an adult woman and an adult man speak", "wind noise makes sound into a microphone"], "sample_ids": ["zTLVJCo4WEE", "w8uLijTqtlU"], "start_seconds": ["30", "70"], "properties": ["two people, adult, speak", "wind, microphone, noise"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "footage is blurry and shaky"], "captions_pred_audio": ["a woman speaks and crickets chirp", "the wind is blowing strongly"], "question": "which is not a person", "label": 1}, {"captions": ["an engine runs loudly", "a man speaks then multiple motorcycles pass by"], "sample_ids": ["vqZuVbG6-HI", "zcDwZ6W7E3E"], "start_seconds": ["130", "180"], "properties": ["loud, engine, run", "a, man, speak"], "captions_pred_video": ["footage is blurry because it's 
raining outside", "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["xfudFO976zE", "xfaoyyzw2WU"], "start_seconds": ["0", "180"], "properties": ["animal, bleats, cry", "loud, jet engine, roar"], "captions_pred_video": ["footage is blurry and shaky", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yVumC9TGknc", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["humming, clock, birds", "female, spraying, scream"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a series of beeps and chirps", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vb1fPSDI4c", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["multiple, people, yell", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a woman is speaking and a subway train is moving "], "question": "which entity has a 
woman speaking?", "label": 1}, {"captions": ["birds chirp as a bell rings", "winds blows roughly as a vehicle races past"], "sample_ids": ["ziUT9IFTkjg", "xjvTpk2Zpr8"], "start_seconds": ["10", "70"], "properties": ["chirp, bell, ring", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a jet engine roars and wind blows "], "question": "which entity is more likely to be a natural occurrence", "label": 0}, {"captions": ["a person is whistling", "pigeons vocalize and birds chirp"], "sample_ids": ["sIXTftIuUgw", "uiS58TNyUiw"], "start_seconds": ["90", "430"], "properties": ["person, whistling, person", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person?", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "dishes cling together then a man begins to speak"], "sample_ids": ["vf9xf3vMsGM", "sQGXqGcwOTc"], "start_seconds": ["540", "3"], "properties": ["A man speaks while turning a water faucet on.", "cling, speak, dishes"], "captions_pred_video": ["of the person washing their hands under the faucet", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking while water is running in the background", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a male speaks over some small clicks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["uXxVebHsGZ8", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["male, clicks, speak", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking and ducks are quacking with 
wind noise in the background "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "water splashes and a door squeaks"], "sample_ids": ["tqR406bGiE", "sdXV-ylviw"], "start_seconds": ["40", "190"], "properties": ["flush, water, gurgle", "sound, splash, door"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a dog barks and taps with background noise "], "question": "which entity has a door that squeaks?", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["zj2R0XoFr5k", "siJFXfGWgDk"], "start_seconds": ["50", "50"], "properties": ["airplane, fly, overhead", "a, bird, vehicle"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking and birds are chirping in the background "], "question": "which entity is flying overhead", "label": 0}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["yLy-WycbVVE", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["background, people, talk", "gun, shoot, water"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["water rushes by", "an insect buzzes around continuously"], "sample_ids": ["x-PeY8Yb8M4", "v25l1jef3JY"], "start_seconds": ["300", "0"], 
"properties": ["water, rushes, by", "buzzes, continuously, insect"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a car is driving on a wet road ", "a fly is buzzing around a microphone "], "question": "which entity is moving faster", "label": 0}, {"captions": ["a beep repeats multiple times", "a car accelerates and wind blows"], "sample_ids": ["y682ml90jGw", "u0TrcHhkPQ"], "start_seconds": ["11", "20"], "properties": ["beep, repeat, multiple", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a dark barks and whimpers", "a man speaks as a car is passing by"], "sample_ids": ["sYj4hpDUZDQ", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "a, car, pass"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "a person snores loudly multiple times at a close distance"], "sample_ids": ["x4a9YGIw4ok", "sSMl2vc3ek"], "start_seconds": ["120", "20"], "properties": ["water, gurgles, stops", "loud, multiple, distance"], "captions_pred_video": 
["footage is blurry and out of focus", null], "captions_pred_audio": ["a toilet flushes and water splashes", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "an adult woman speaks over chopping and silverware noises"], "sample_ids": ["xOZfdgAgJ9o", "yYJksgsxx5U"], "start_seconds": ["40", "30"], "properties": ["woman, whimpering, speaking", "audio, woman, silverware"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "of a woman slicing an orange on a cutting board"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and dishes are clanging in the background "], "question": "which woman is speaking", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wAAkbZToh8", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["burp, laugh, speak", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man burps and a woman speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 0}, {"captions": ["a man speaks while a vehicle engine runs and revs loudly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["ylpYOorfH4o", "wz7N8YRy74I"], "start_seconds": ["410", "30"], "properties": ["engine, run, loud", "rooster, crow, background, men"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 
dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a man speaking to?", "label": 0}, {"captions": ["several insects fly while two men talk", "water flows as men speak and yell"], "sample_ids": ["s-T9OVOiMLo", "vJ7JPEFhyLA"], "start_seconds": ["330", "16"], "properties": ["several, fly, men", "water, flow, men"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows men speaking and yelling?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "birds chirp and objects are moved around"], "sample_ids": ["wTideSjRFS0", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["food, sizzle, woman", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "insects buzz and a man speaks"], "question": "which entity is about birds?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "some tunes played by whistling"], "sample_ids": ["s4Uz1Ffgo04", "u6BnG6YZqJ4"], "start_seconds": ["100", "0"], "properties": ["roars, background, people speaking", 
"tune, play, whistling"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["xjvTpk2Zpr8", "zFjIWfSD-4"], "start_seconds": ["70", "410"], "properties": ["engine, run, wind", "People, motor, brakes"], "captions_pred_video": ["footage of a dhl plane landing on the runway", null], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is running", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "a motor noise is accompanied by a door opening and closing"], "sample_ids": ["xyL9F5VrjkE", "vBHyYJ8pL0"], "start_seconds": ["20", "2"], "properties": ["wind, blows, vehicle", "noise, door, opening"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", null], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is accompanied by a door opening and closing?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "women speak as water runs briefly, children call out, and a man speaks"], "sample_ids": ["wTideSjRFS0", "uRExseg-0XI"], "start_seconds": ["30", "210"], "properties": ["food, sizzle, woman", "woman, man, water"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon"], "captions_pred_audio": ["a woman is speaking while water is running 
in the background", "a man is speaking while water is running and birds are chirping "], "question": "which entity has a man speaking?", "label": 1}, {"captions": ["male speech with light ticking", "a man speaks with another voice speaking in the background"], "sample_ids": ["xO-Q2BlIIPU", "u21-Z5gJCB8"], "start_seconds": ["30", "30"], "properties": ["male, speech, ticking", "background, voice, man"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "- a person cooking eggs in a pan on the stove"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking with another voice speaking in the background?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "water splashes as an animal walks through"], "sample_ids": ["vZAw4apG0Es", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["background, tick, repeat", "animal, water, splashes"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a clock is ticking and people are talking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a baby laugh at a sputter"], "sample_ids": ["v-wcQf4BDY0", "sLUnaPT5gM8"], "start_seconds": ["120", "0"], "properties": ["bark, yip, sharply", "laugh, sputter, baby"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a dog barks and growls", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more likely to be a child", "label": 1}, {"captions": ["a infant 
makes noise and is excited", "water pouring and bubbling"], "sample_ids": ["wIJK3-5y0kA", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["noise, excited, infant", "water, bubbles, pouring"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a baby cries and a woman speaks", "water is running from a faucet"], "question": "which entity is bubbling", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "an infant crying frantically"], "sample_ids": ["wAAkbZToh8", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["burp, laugh, speak", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a man burps and a woman speaks", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wsHBIgzs9Fs", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["horn, continuous, buzzing", "three men, wind, flow"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "a man 
in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a duck quacks continuously"], "sample_ids": ["tDlysoZiA1I", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, chirp", "quacks, continuously, duck"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["goats bleat and people speak", "a machine beeps continuously"], "sample_ids": ["z5iUE5h0EPs", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["goats bleat, people speak, language", "beeps, machine, continuously"], "captions_pred_video": ["of the goat in the barn", null], "captions_pred_audio": ["a goat bleats and a man speaks", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks as a machine runs", "water flows as men speak and yell"], "sample_ids": ["vD6lYD1l0BY", "vJ7JPEFhyLA"], "start_seconds": ["330", "16"], "properties": ["a, machine, run", "water, flow, men"], "captions_pred_video": ["game controller being held in the hands of the person", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a man speaking as a machine runs?", "label": 0}, {"captions": ["a horn blasts loudly as a train passes", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zsLxS-uLJTw", "wDVMhEdTiVw"], "start_seconds": ["20", "30"], 
"properties": ["horn, blast, train", "gun, shoot, water"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause a train to pass", "label": 0}, {"captions": ["a man speaks while turning a water faucet on", "a man speaks over intermittent keyboard taps"], "sample_ids": ["vf9xf3vMsGM", "tw76HGONaKg"], "start_seconds": ["540", "570"], "properties": ["A man speaks while turning a water faucet on.", "audio, man, keyboard"], "captions_pred_video": ["of the person washing their hands under the faucet", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a man speaks and types on a computer keyboard "], "question": "which entity is a video?", "label": 0}, {"captions": ["a vehicle engine revs and tires squeal", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yDoT73BWsdA", "vbZ-0lGPneg"], "start_seconds": ["10", "30"], 
"properties": ["engine revs, tires squeal, vehicle", "a woman, a television program, a bird"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a man speaks over a running engine and blowing wind", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["ylpYOorfH4o", "ukg5L09Wpvo"], "start_seconds": ["410", "150"], "properties": ["engine, running, wind", "clickety-clack, train, whistle"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "water splashes as an animal walks through"], "sample_ids": ["tPJvjq9QePY", "w1ir-sZ3Im8"], "start_seconds": ["40", "90"], "properties": ["animal, bleat, moo", "animal, water, splashes"], "captions_pred_video": ["a dog and a sheep in a barn", "footage of a group of people riding horses through a river"], "captions_pred_audio": 
["a baby cries and a man speaks", "water splashes and gurgles as people speak"], "question": "which animal is more active", "label": 1}, {"captions": ["an engine runs and wind blows", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vs65y4qmyBE", "vbZ-0lGPneg"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "a woman, a television program, a bird"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["spJCm8tD9Zo", "uEU-Hg5MTN8"], "start_seconds": ["90", "27"], "properties": ["snores, wheezes, sleeps", "animal, grunts, snorts"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["an animal growls followed by birds chirping", "people cheer as a vehicle engine revs"], "sample_ids": ["y2ZBGpgbhHM", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["animal, growl, bird", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["birds chirping and a dog panting", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["skd2PphS6oI", "zj2R0XoFr5k"], 
"start_seconds": ["190", "50"], "properties": ["ring, bird, vocalize", "airplane, boy, fly"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a woman and man are speaking", "paper folding and crinkling"], "sample_ids": ["vbpKkWvfOu4", "zPpG3RD8lSs"], "start_seconds": ["560", "20"], "properties": ["two people, speaking, woman, man", "paper, fold, crinkle"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "the wind blows and a mouse clicks "], "question": "which is not a person", "label": 1}, {"captions": ["birds chirp quietly 
and an adult man speaks", "a duck quacks loudly and continuously"], "sample_ids": ["zuua6-5goWw", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "loud, continuous, quacks"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["a large bell chimes back and forth loudly", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["w2M4i1mklOA", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["loud, chime, bell", "background, birds, rustling"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "birds are chirping and a chime is ringing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["tQWGZLItBXk", "wSVhSdj0F0"], "start_seconds": ["170", "10"], "properties": ["voice, music, whoosh", "horn honks, keys jingle, electronic beep"], "captions_pred_video": ["worms revolution screenshots", 
null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a car horn honks and keys jangle with background noise "], "question": "which entity has a horn honk?", "label": 1}, {"captions": ["a man talks as several small engines run", "a woman speaks and other women and a man talk with her"], "sample_ids": ["u9A6VZQCZpU", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["a, man, talk", "a, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a beep repeats multiple times", "people applaud and hoot and chat quietly"], "sample_ids": ["y682ml90jGw", "wwyfGO2J4"], "start_seconds": ["11", "90"], "properties": ["beep, repeat, multiple", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "an engine runs loudly"], "sample_ids": ["wnpJndXuxLc", "vqZuVbG6-HI"], "start_seconds": ["50", "130"], "properties": ["beeps, loud, whistle", "loud, engine, run"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": 
["popping and crackling repeats as men yell and laugh", "a man speaks in the background while a slow tick repeats"], "sample_ids": ["rqu8iB22IY", "vZAw4apG0Es"], "start_seconds": ["5", "30"], "properties": ["sound, repeats, laugh", "background, tick, repeat"], "captions_pred_video": [null, "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a clock is ticking and people are talking"], "question": "which entity has a tick repeating in the background?", "label": 1}, {"captions": ["children speak and play together", "people cheer as a vehicle engine revs"], "sample_ids": ["yVVP8XvWJTo", "xjhAnI2q6hM"], "start_seconds": ["260", "6"], "properties": ["children, speak, play", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a playground at a school or daycare center", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a machine beeps continuously", "some men converse over an engine running"], "sample_ids": ["y682ml90jGw", "sCiy7QS1U"], "start_seconds": ["11", "300"], "properties": ["beeps, machine, continuously", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a beeping sound is being made ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "wind blowing followed by a zoom"], "sample_ids": ["wDVMhEdTiVw", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["gun, shoot, water", "wind, blow, zoom"], "captions_pred_video": ["a blurry image of trees and water in the forest", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a gun is 
fired followed by splashing and a person sneezing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vuUVPzd2FXw", "uZesmtKZGSw"], "start_seconds": ["160", "250"], "properties": ["a, steam, release", "men, talk, cars"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tDVADusiIoc", "wqZ135Ssz0"], "start_seconds": ["60", "60"], "properties": ["water, radio, man", "two men, woman, birds"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a dark barks and whimpers", "a child speaks in closed space"], "sample_ids": ["sYj4hpDUZDQ", "yW6FWLSLkx4"], 
"start_seconds": ["30", "40"], "properties": ["barks, whimpers, dark", "child, space, speak"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a dog barks and a cat meows", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zj2R0XoFr5k", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, overhead", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an adult woman and an adult man speak", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["zTLVJCo4WEE", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["two people, adult, speak", "two men, woman, birds"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["su6FAOcOA8c", "uEU-Hg5MTN8"], "start_seconds": ["4", "27"], "properties": ["engine, idle, woman", "animal, grunts, snorts"], "captions_pred_video": ["shows a group of people riding on a crowded subway 
train", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking to an animal?", "label": 1}, {"captions": ["water running down a sink while a man is talking", "water flows as men speak and yell"], "sample_ids": ["vSeGhaZt-aI", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["water, sink, talk", "water, flow, men"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water flowing", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["s59PfAghdkM", "vfYTJq7nU"], "start_seconds": ["0", "130"], "properties": ["bird, chirp, background, horse, neigh", "rustling, ducks, quack"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", null], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["birds chirp and objects are moved around", "winds blows roughly as a vehicle races past"], "sample_ids": ["yPUYU6t3rwo", "xjvTpk2Zpr8"], "start_seconds": ["370", "70"], "properties": ["birds chirp, objects are moved around, birds", "wind, blows, vehicle"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage of a dhl plane landing on the runway"], 
"captions_pred_audio": ["insects buzz and a man speaks", "a jet engine roars and wind blows "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["wIvYjuR3nrg", "zj2R0XoFr5k"], "start_seconds": ["9", "50"], "properties": ["birds, pigeons, vocalize", "airplane, boy, fly"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["birds are chirping and cooing", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a clock ticktocks"], "sample_ids": ["wfHeoPDLMaM", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "ticktocks, clock, ticktocks"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": 
["ducks are quacking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a jet engine screams, then increases its power"], "sample_ids": ["u21-Z5gJCB8", "vBslzh7saPw"], "start_seconds": ["30", "90"], "properties": ["background, voice, man", "power, scream, increase"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars and accelerates "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["wz7N8YRy74I", "vzceMbklWc"], "start_seconds": ["30", "180"], "properties": ["rooster, crow, background, men", "water, faucet, sink"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "water is running and a man is speaking"], "question": "which entity has a sink?", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a man speaks as bees buzz and birds chirp"], "sample_ids": ["zliInBdC98Y", "t25U-v4k4ts"], "start_seconds": ["30", "40"], "properties": ["a, baby, cries, wails", "bees buzz, birds chirp, man speaks"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "of a beekeeper working on a beehive in the woods"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and bees are buzzing"], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a telephone rings followed by a woman talking"], "sample_ids": ["w2M4i1mklOA", 
"tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "ring, talk, woman"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks and dog vocalizes", "a telephone rings followed by a woman talking"], "sample_ids": ["uWAAAL4CIoc", "tGcFnX0GHI"], "start_seconds": ["0", "0"], "properties": ["a, dog, vocalize", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sShpyu2l4YQ", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["growl, bark, yip", "animal, grunts, snorts"], "captions_pred_video": ["the puppies are playing with a toy", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a dog is barking and growling", "a woman is speaking and a baby is crying"], "question": "which entity is more snorts", "label": 1}, {"captions": ["long loud burping by a man", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xmiUIOhtZyQ", "tDVADusiIoc"], "start_seconds": ["60", "60"], "properties": ["loud, burp, man", "water, radio, man"], "captions_pred_video": ["homer simpson drinking a beer", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio?", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "a loud snarling engine is followed by a man 
laughing"], "sample_ids": ["w2JXXIAdUdg", "zl9Dqx-j7q4"], "start_seconds": ["10", "6"], "properties": ["emits, sleeping, person", "engine, laugh, loud"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sjlVMgdGSK0", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["car, revving, loudly", "airplane, boy, fly"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["smDKStoHBJo", "y8WEcpOlT3I"], "start_seconds": ["0", "40"], "properties": ["a, talk, baby, cry", "harsh, wind, blows"], "captions_pred_video": ["a man holding a crying baby in his arms", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking with wind noise in the background "], "question": "which entity has a harsh wind blowing?", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "soft movement is accompanied by clocks ticking in the background"], "sample_ids": ["zdYdyF9-m8U", "vlJS7LN2XyM"], "start_seconds": ["7", "30"], "properties": ["wind, crash, shoreline", "background, clocks, ticking"], "captions_pred_video": ["a person 
kayaking in the ocean near a cliff", "of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video"], "captions_pred_audio": ["waves crash and wind blows ", "a ticktock of a clock"], "question": "which entity is more calm", "label": 1}, {"captions": ["bees buzz as wind blows", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["tMJne1a4AFI", "xV7Mg1QucSc"], "start_seconds": ["0", "14"], "properties": ["bees, buzz, wind", "alarm, ticktocks, laughs"], "captions_pred_video": ["a swarm of bees on the ground", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a swarm of bees buzzing around", "an alarm clock ticks and a woman laughs"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a horn rings out as a machine runs by"], "sample_ids": ["wnpJndXuxLc", "slZLHwNbbt4"], "start_seconds": ["50", "300"], "properties": ["blows, vehicle, train", "a, horn, run"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a toilet flushes and water drains", "an airplane engine spools and people speak"], "sample_ids": ["sfAvvZwdLCY", "wTjoRj1se3U"], "start_seconds": ["20", "390"], "properties": ["water drains, flushes, water", "airplane, engine, spool"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a toilet is flushed", "a jet engine is running and people are talking"], "question": "which entity is a machine", "label": 1}, 
{"captions": ["a jet engine spools up and takes off", "several insects fly while two men talk"], "sample_ids": ["vBslzh7saPw", "s-T9OVOiMLo"], "start_seconds": ["90", "330"], "properties": ["engine, spools, takes", "several, fly, men"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["some tunes played by whistling", "someone whistles a tune"], "sample_ids": ["u6BnG6YZqJ4", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["tune, play, whistling", "someone, tune, whistle"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", null], "captions_pred_audio": ["a person whistling a song", "a person whistling a song"], "question": "which is a more active way to play a tune", "label": 0}, {"captions": ["a man talks while a clock does ticktock", "an engine runs loudly"], "sample_ids": ["spYNpeN7rPY", "vqZuVbG6-HI"], "start_seconds": ["1", "130"], "properties": ["a clock, ticktock, man", "loud, engine, run"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wSVhSdj0F0", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["beep, clang, footsteps", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more like a movie", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "an insect buzzes around continuously"], "sample_ids": ["sQwlkXjQabo", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["liquid, surface, spray", 
"buzzes, continuously, insect"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["spraying followed by silence", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["bees buzz as wind blows", "people cheer as a vehicle engine revs"], "sample_ids": ["tMJne1a4AFI", "xjhAnI2q6hM"], "start_seconds": ["0", "6"], "properties": ["bees, buzz, wind", "engine revs, vehicle, people"], "captions_pred_video": ["a swarm of bees on the ground", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a swarm of bees buzzing around", "a truck is revving its engine and a man is speaking "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "wind blows as people chatter quietly"], "sample_ids": ["zgUgkpk78xU", "xBxDz0CFVn0"], "start_seconds": ["70", "30"], "properties": ["clinking, humming, horn", "wind, chatter, people"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage is blurry and out of focus"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "wind blows as people chatter quietly"], "sample_ids": ["ziUT9IFTkjg", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["background, birds, rustling", "wind, chatter, 
people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "an infant crying as a woman laughs"], "sample_ids": ["s3cTDAj31g", "xhmRY9yhC7c"], "start_seconds": ["80", "20"], "properties": ["man, talk, woman", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks while playing a video game on a keyboard", "a horn rings out as a machine runs by"], "sample_ids": ["tw76HGONaKg", "slZLHwNbbt4"], "start_seconds": ["570", "300"], "properties": ["A, game, keyboard", "a, horn, run"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a train is moving and blowing its horn with a clickety-clack 
sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "water running down a sink while a man is talking"], "sample_ids": ["uWAAAL4CIoc", "vSeGhaZt-aI"], "start_seconds": ["0", "50"], "properties": ["a woman, chirps, animal", "water, sink, talk"], "captions_pred_video": [null, "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a woman and man are speaking", "water splashes as an animal walks through"], "sample_ids": ["vbpKkWvfOu4", "w1ir-sZ3Im8"], "start_seconds": ["560", "90"], "properties": ["two people, speaking, woman, man", "animal, water, splashes"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is a video of an animal walking through water?", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "someone whistles a tune"], "sample_ids": ["wqZ135Ssz0", "sIXTftIuUgw"], "start_seconds": ["60", "90"], "properties": ["man, woman, squawks", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a male is speaking 
and a duck quacks as others laugh"], "sample_ids": ["sU53zg9Jp7s", "tiDFTC-5vU"], "start_seconds": ["380", "30"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "male, duck, laugh"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["rain falls onto a hard surface and thunder roars before music plays", "water flows and trickles"], "sample_ids": ["xNMovAf3o50", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["rain, thunder, music", "water, flow, trickle"], "captions_pred_video": ["tieng mua - the falling rain lynk lee", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["thunder and rain with music playing in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "water pouring and bubbling"], "sample_ids": ["x6ijhqRY38s", "uyRfq-jKPpo"], "start_seconds": ["250", "50"], "properties": ["bowl, silverware, man", "water, bubbles, pouring"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a car speeding up in the distance"], "sample_ids": ["u21-Z5gJCB8", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, voice, man", "distance, car, speed"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "someone whistles a tune"], "sample_ids": ["wz7N8YRy74I", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["rooster, crow, background, people", "someone, tune, whistle"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", null], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["sa6TLVbooCc", "w34HjHr6gAY"], "start_seconds": ["240", "30"], "properties": ["people, laugh, child", "beeps, hit, woman"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 
the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a clock ticktocks briefly", "people speak as gunfire rings out"], "sample_ids": ["u7C-AEBQM", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["ticktocks, clock, ticktocks briefly", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a toilet flushes and water sputters as it drains"], "sample_ids": ["xfaoyyzw2WU", "smGI3C1NZc"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "water, drain, toilet"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", null], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a toilet is flushed"], "question": "which entity is quieter", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "people speak as gunfire rings out"], "sample_ids": ["yI-KvObbDoY", "wqTCwqVRDlk"], "start_seconds": ["260", "80"], "properties": ["sound, smack, wind", "gunfire, ring, speak"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a man is speaking and a gun is fired"], "question": "which entity is more 
violent", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["yYEVLuqEytU", "yeFvk9x0wWI"], "start_seconds": ["40", "30"], "properties": ["animal, pig, background", "clack, bird, chirp"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["several sheep bleat and a man speaks", "birds chirp in the background as a car drives by "], "question": "which entity has a bird chirping in the background?", "label": 0}, {"captions": ["here comes the train and it starts to blow the horn and get close", "water pouring and bubbling"], "sample_ids": ["s7knHCFW82w", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["blow horn, get close, train", "water, bubbles, pouring"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as bees buzz and 
birds chirp", "a car speeding up in the distance"], "sample_ids": ["t25U-v4k4ts", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["bees buzz, birds chirp, man speaks", "distance, car, speed"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["an engine starts and increases in power", "an airplane flies overhead as a woman speaks"], "sample_ids": ["zjTG0gaGCUI", "zj2R0XoFr5k"], "start_seconds": ["80", "50"], "properties": ["power, increase, engine", "airplane, fly, overhead"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a woman speaks while a helicopter flies overhead "], "question": "which object is moving", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "waves crash against a shoreline and people speak"], "sample_ids": ["ujMt0-D-x2k", "yFB25fqfU8I"], "start_seconds": ["0", "300"], "properties": ["snoring, rhythmical, nearby", "wave, crash, shoreline"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sNB8zxXneIM", "yswmmRZFItk"], "start_seconds": ["20", "0"], "properties": ["several, quack, cocks", "background, frog, croak"], "captions_pred_video": ["a group of geese in a cage", "a close up of a frog in the water"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a frog is 
croaking"], "question": "which animal is more likely to be a frog", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "several insects fly while two men talk"], "sample_ids": ["wAAkbZToh8", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["burp, laugh, speak", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man burps and a woman speaks", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about a person speaking and laughing?", "label": 0}, {"captions": ["water splashes and wind noise is made into a microphone", "water flows as men speak and yell"], "sample_ids": ["sDSppXIlJrs", "vJ7JPEFhyLA"], "start_seconds": ["27", "16"], "properties": ["microphone, water, wind", "water, flow, men"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "some men converse over an engine running"], "sample_ids": ["se87d6yxEOA", "sCiy7QS1U"], "start_seconds": ["10", "300"], "properties": ["run, whistle, pass", "men, converse, engine"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", null], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a steam engine?", "label": 0}, {"captions": ["a woman and man are speaking", "people applaud and hoot and chat quietly"], "sample_ids": ["vbpKkWvfOu4", "wwyfGO2J4"], "start_seconds": ["560", "90"], "properties": ["two people, speaking, woman, man", 
"people, applaud, hoot"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity shows more people", "label": 1}, {"captions": ["people speak softly as food sizzles", "people applaud and hoot and chat quietly"], "sample_ids": ["yhQ2Lg-7qDY", "wwyfGO2J4"], "start_seconds": ["130", "90"], "properties": ["food, sizzle, speak", "people, applaud, hoot"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["an engine starts and increases in power", "people speak as gunfire rings out"], "sample_ids": ["zjTG0gaGCUI", "wqTCwqVRDlk"], "start_seconds": ["80", "80"], "properties": ["power, increase, engine", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "pigeons vocalize and birds chirp"], "sample_ids": ["y2ZBGpgbhHM", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["animal, growl, bird", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a horn honks and 
then loudly blares", "a car revs and accelerates loudly and men and women chatter among themselves"], "sample_ids": ["wnpJndXuxLc", "y8dSeubCNI"], "start_seconds": ["50", "4"], "properties": ["horn, honk, loud", "men, women, car"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "an engine revving and people talking in the background"], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a man speaks while water drains"], "sample_ids": ["wP8ZKrlx3oA", "vSeGhaZt-aI"], "start_seconds": ["40", "50"], "properties": ["heavy, rain, fall", "water, drain, man"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a man speaking while water drains?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "goats bleat and people speak"], "sample_ids": ["uEU-Hg5MTN8", "z5iUE5h0EPs"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "goats bleat, people speak, language"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of the goat in the barn"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a goat bleats and a man speaks"], "question": "which entity is a language", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "winds blows roughly as a vehicle races past"], "sample_ids": ["xKB8O8LTs6s", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], "properties": ["music, gunfire, explosion", "wind, blows, vehicle"], "captions_pred_video": ["in your own 
words a screenshot of the game's loading screen", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a jet engine roars and wind blows "], "question": "which entity is more calm", "label": 1}, {"captions": ["a motorcycle engine is idling", "a woman speaks happily and an animal chirps"], "sample_ids": ["vZAqdHZ81yA", "uWAAAL4CIoc"], "start_seconds": ["180", "0"], "properties": ["engine, motorcycle, idling", "a woman, chirps, animal"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", null], "captions_pred_audio": ["an engine is idling loudly", "a woman is speaking and a dog is barking "], "question": "which entity is not a motorcycle?", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a vehicle engine accelerating then running on idle"], "sample_ids": ["siJFXfGWgDk", "vYkA3cfXp5Q"], "start_seconds": ["50", "30"], "properties": ["a, bird, vehicle", "engine, accelerate, idle"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "an engine is idling"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a train horn blows as it passes by"], "sample_ids": ["tOSWIURC-4", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["engine, work, nearby", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a lawn mower is running ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vdoxuJn9lTc", "zj2R0XoFr5k"], "start_seconds": ["40", "50"], "properties": ["burp, loud, girl", "airplane, boy, fly"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a child speaks followed by a burp", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "water pouring and bubbling"], "sample_ids": ["wSVhSdj0F0", "uyRfq-jKPpo"], "start_seconds": ["10", "50"], "properties": ["horn honks, keys jingle, slam", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how 
to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "three men talk while wind blows and some liquid flows"], "sample_ids": ["uJV8NDaHqqk", "vJ7JPEFhyLA"], "start_seconds": ["100", "16"], "properties": ["loud, fly, chirp", "three men, wind, flow"], "captions_pred_video": ["a bee hive in a wooden box", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "birds chirp and an insect buzzes around"], "sample_ids": ["vZAw4apG0Es", "t97k0cejSQE"], "start_seconds": ["30", "250"], "properties": ["background, clock, ticktocks", "bird, chirp, insect"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a bee on a purple thistle flower"], "captions_pred_audio": ["a clock is ticking and people are talking", "a bee buzzes and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a beep occurs briefly", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xtWeJ56-U-g", "ukg5L09Wpvo"], "start_seconds": ["20", "150"], "properties": ["beep, occur, briefly", "clickety-clack, train, whistle"], "captions_pred_video": ["how to create a simple program in python 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 8", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["mechanisms are ticking and a beep is heard ", "a train blows its whistle and blows its horn "], "question": "which is continuous", "label": 0}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "people talk quietly in the distance, followed by a police car siren wailing"], "sample_ids": ["uC9dtII1KDI", "wy1eKjR7KC0"], "start_seconds": ["150", "30"], "properties": ["wind, gusts, distance", "people, talk, distance"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "two police officers riding motorcycles down the street"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking and a siren is going off"], "question": "which entity is more distant", "label": 1}, {"captions": ["electronic beeps occur in a short series", "a woman speaks as she rubs two objects together"], "sample_ids": ["y682ml90jGw", "vzxHnu-SFEw"], "start_seconds": ["11", "80"], "properties": ["beeps, series, electronic", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a beeping sound is being made ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a woman speaks in a fast tone with a male"], "sample_ids": ["sYITalLZjj4", "sTpirNYo8vQ"], "start_seconds": ["30", "30"], "properties": ["water, rushes, background, birds", "a, tone, fast"], "captions_pred_video": ["two ducks are swimming in the water near each other", "of a man taking a selfie on a bus"], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking while a car is revving and accelerating "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a infant makes noise and is excited", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wIJK3-5y0kA", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["noise, excited, infant", "men, talk, cars"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, 
{"captions": ["a person whistles a meandering tune", "pigeons vocalize and birds chirp"], "sample_ids": ["uFoga8sHpiw", "uiS58TNyUiw"], "start_seconds": ["90", "430"], "properties": ["person, tune, whistle", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a bird in a cage", "of the pigeon in the cage"], "captions_pred_audio": ["a person whistles a song", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "small dogs yip and bark sharply"], "sample_ids": ["w1mlz3Pe4fU", "v-wcQf4BDY0"], "start_seconds": ["300", "120"], "properties": ["vocalize, chirp, continuously", "bark, yip, sharply"], "captions_pred_video": ["of a bird in a cage", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["birds are chirping and singing", "a dog barks and growls"], "question": "which entity is more vocal", "label": 1}, {"captions": ["wind blows and people talk while livestock vocalizes", "a clock ticktocks"], "sample_ids": ["vXlk0lIQBFo", "v-g-j2uTByM"], "start_seconds": ["470", "30"], "properties": ["wind, talk, vocalize", "ticktocks, clock, ticktocks"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["someone whistles a song", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["sIXTftIuUgw", "zY3icUyMdh8"], "start_seconds": ["90", "20"], "properties": ["someone, song, whistle", "dog, bark, engine"], "captions_pred_video": [null, "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a person whistling a song", "a car is driving and dogs are barking and squealing "], "question": "which entity is a person", 
"label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["vKrYfzleLB8", "yLy-WycbVVE"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "background, people, talk"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity has a man yell?", "label": 0}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "a car accelerates and wind blows"], "sample_ids": ["weDbePuc-Xc", "u0TrcHhkPQ"], "start_seconds": ["40", "20"], "properties": ["cartoon character, music, vocalize", "accelerates, wind, blows"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", null], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person snoring several times", "paper is crumpling consistently"], "sample_ids": ["spJCm8tD9Zo", "v5cSxLaHADY"], "start_seconds": ["90", "0"], "properties": ["snore, person, several", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person is snoring loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["children speak as a female ask them questions", "vehicles pass by on a roadway"], "sample_ids": ["wEBlkGWVWwE", "tgbONvsP47Y"], "start_seconds": ["260", "0"], "properties": 
["female, speak, questions", "pass, vehicle, roadway"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["someone whistles a tune", "an infant crying as a woman laughs"], "sample_ids": ["sIXTftIuUgw", "xhmRY9yhC7c"], "start_seconds": ["90", "20"], "properties": ["someone, tune, whistle", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a person whistling a song", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a jet engine spools up and takes off", "people cheer as a vehicle engine revs"], "sample_ids": ["vBslzh7saPw", "xjhAnI2q6hM"], "start_seconds": ["90", "6"], "properties": ["engine, spools, takes", "engine revs, vehicle, people"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man talks while vehicles pass by", "birds chirp and objects are moved around"], "sample_ids": ["sK4u5T8hW78", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["a, man, talk", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a stream of water flows quickly", "paper is crumpling consistently"], "sample_ids": ["wbHTKEJZyhc", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["stream, water, flow", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "water 
splashes as an animal walks through"], "sample_ids": ["vMf1dLD6Sng", "w1ir-sZ3Im8"], "start_seconds": ["6", "90"], "properties": ["frog, bird, vocalize", "animal, water, splashes"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a frog croaks loudly", "water splashes and gurgles as people speak"], "question": "which entity is not a frog?", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "an adult woman and an adult man speak"], "sample_ids": ["y8WEcpOlT3I", "zTLVJCo4WEE"], "start_seconds": ["40", "30"], "properties": ["wind, speak, buffeting", "two people, adult, speak"], "captions_pred_video": ["on how to use a sewing machine youtube", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman speaks and crickets chirp"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["zofjfKhqLk8", "xBxDz0CFVn0"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "stream, water, flow"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage is blurry and out of focus"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "some tunes played by whistling"], "sample_ids": ["y2ZBGpgbhHM", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["dog, chirp, breathe", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["birds chirping and a dog panting", "a person 
whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "people speak as gunfire rings out"], "sample_ids": ["y1saVTXsKwc", "wqTCwqVRDlk"], "start_seconds": ["80", "80"], "properties": ["a, dog, talk", "gunfire, ring, speak"], "captions_pred_video": ["a dog playing with a pink ball", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tDlysoZiA1I", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["animal, grunt, chirp", "a woman, something, fried"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a child speaks in closed space"], "sample_ids": ["w5W5Kqtc8E", "yW6FWLSLkx4"], "start_seconds": ["100", "40"], "properties": ["water, splashes, motorboat", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "water splashes and a door squeaks"], "sample_ids": ["vZAw4apG0Es", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["background, tick, repeat", "sound, splash, door"], 
"captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a dog barks and taps with background noise "], "question": "which entity has a door?", "label": 1}, {"captions": ["long loud burping by a man", "birds chirp and objects are moved around"], "sample_ids": ["xmiUIOhtZyQ", "yPUYU6t3rwo"], "start_seconds": ["60", "370"], "properties": ["loud, burp, man", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["homer simpson drinking a beer", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a person burps and music plays in the background ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wvKpEYswXO0", "yajyRTUQk3U"], "start_seconds": ["150", "400"], "properties": ["plastic, tap, speak", "a woman, something, fried"], "captions_pred_video": ["of the person preparing food in the kitchen", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking while food is frying in the background"], "question": "what is being tapped on in the first picture?", "label": 0}, {"captions": ["a man speaks as a vehicle engine idles", "a child speaks in closed space"], "sample_ids": ["shmR4OZtzqA", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["man, engine, idle", "child, space, speak"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 
2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man speaks while a motor runs", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a horn beeps twice followed by a clang and then some footsteps and another beep", "a stream of water flows quickly"], "sample_ids": ["wSVhSdj0F0", "wbHTKEJZyhc"], "start_seconds": ["10", "20"], "properties": ["beep, clang, footsteps", "stream, water, flow"], "captions_pred_video": [null, "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a 
bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a waterfall is flowing and people are speaking "], "question": "which entity is moving", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a motor idles, accelerates, then slows down."], "sample_ids": ["tQWGZLItBXk", "vYkA3cfXp5Q"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "speed, idle, accelerate"], "captions_pred_video": ["worms revolution screenshots", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "an engine is idling"], "question": "which entity is more like a machine", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "an insect buzzes around continuously"], "sample_ids": ["xfaoyyzw2WU", "v25l1jef3JY"], "start_seconds": ["180", "0"], "properties": ["loud, jet engine, roar", "buzzes, continuously, insect"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a fly is buzzing around a microphone "], "question": "which entity is quieter", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "a stream of water runs briefly"], "sample_ids": ["wqZ135Ssz0", "x-PeY8Yb8M4"], "start_seconds": ["60", "300"], "properties": ["two men, woman, birds", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a man 
speaks as crickets sing", "a train horn blows as it passes by"], "sample_ids": ["ryFDPxgDOGc", "zVacuqSb4LI"], "start_seconds": ["570", "30"], "properties": ["a, crickets, sing", "horn, blows, train"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is louder", "label": 0}, {"captions": ["people converse in the distance as a clock ticks", "vehicles pass by on a roadway"], "sample_ids": ["vZAw4apG0Es", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["people, clock, converse", "pass, vehicle, roadway"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a clock is ticking and people are talking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks over intermittent keyboard taps", "a man speaks over intermittent keyboard taps"], "sample_ids": ["tw76HGONaKg", "tw76HGONaKg"], "start_seconds": ["570", "570"], "properties": ["audio, man, 
keyboard", "audio, man, keyboard"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man speaks and types on a computer keyboard "], "question": "which entity is a video", "label": 1}, {"captions": ["a small engine idles continuously", "a gun shoots, followed by water sloshing nearby"], "sample_ids": 
["y5WII6cTH7k", "wDVMhEdTiVw"], "start_seconds": ["40", "30"], "properties": ["engine, idle, continuously", "gun, shoot, water"], "captions_pred_video": ["footage of a sewing machine stitching a red and white hat", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["an engine is knocking and vibrating ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a gun?", "label": 0}, {"captions": ["birds twitter and chirp and clatter", "a man speaks while turning a water faucet on"], "sample_ids": ["yeFvk9x0wWI", "vf9xf3vMsGM"], "start_seconds": ["30", "540"], "properties": ["chirp, twitter, clatter", "A man speaks while turning a water faucet on."], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of the person washing their hands under the faucet"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking while water is running in the background"], "question": "which entity is silent", "label": 1}, {"captions": ["a train engine runs and a horn blows", "water flows and trickles"], "sample_ids": ["zPX9o1uDiI", "tB7hWb9gTuQ"], "start_seconds": ["40", "30"], "properties": ["engine, horn, run", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a toilet flushes and a female speaks"], "sample_ids": ["xvDdE3zNf8Y", "yaln9y8I7ms"], "start_seconds": ["120", "230"], "properties": ["A, crumple, paper", "female, flushes, toilet"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman speaks and crumples paper", "a 
toilet flushes and a man speaks"], "question": "which woman is speaking", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zofjfKhqLk8", "ukg5L09Wpvo"], "start_seconds": ["10", "150"], "properties": ["background, metal, clings", "clickety-clack, train, whistle"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "wind blowing followed by a zoom"], "sample_ids": ["yJ0TePmaOo", "vr8ZXjEBhMQ"], "start_seconds": ["390", "150"], "properties": ["two hard objects, man, speak", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "wind blows as people chatter quietly"], "sample_ids": ["slZLHwNbbt4", "xBxDz0CFVn0"], "start_seconds": ["300", "30"], "properties": ["clap, distance, horn", "wind, chatter, people"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "some tunes played by whistling"], "sample_ids": ["yVumC9TGknc", 
"u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["humming, clock, birds", "tune, play, whistling"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a series of beeps and chirps", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a man speaks as a machine runs", "dishes cling together then a man begins to speak"], "sample_ids": ["vD6lYD1l0BY", "sQGXqGcwOTc"], "start_seconds": ["330", "3"], "properties": ["a, machine, run", "cling, speak, dishes"], "captions_pred_video": ["game controller being held in the hands of the person", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking as a machine runs?", "label": 0}, {"captions": ["a man speaks as a motor runs in the background", "an engine sputters followed by a car zooming by"], "sample_ids": ["xZepNM9qcRA", "u5RmF3c3Aw"], "start_seconds": ["30", "60"], "properties": ["background, motor, run", "engine, car, zoom"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a race car accelerates and skids with wind noise in the background "], "question": "which entity has a car zooming by?", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["y2ZBGpgbhHM", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["animal, growl, bird", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["birds chirping and a dog 
panting", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "an engine runs loudly"], "sample_ids": ["sDSppXIlJrs", "vqZuVbG6-HI"], "start_seconds": ["27", "130"], "properties": ["microphone, water, wind", "loud, engine, run"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "footage is blurry because it's raining outside"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a loud engine muffles a man as he speaks"], "sample_ids": ["tOSWIURC-4", "xyx6eNVEYRY"], "start_seconds": ["0", "380"], "properties": ["engine, work, nearby", "loud, engine, muffles"], "captions_pred_video": [null, "footage of a helicopter landing on a runway at an airport"], "captions_pred_audio": ["a lawn mower is running ", "an aircraft engine is running and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vSeGhaZt-aI", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["water, bubbles, speak", "rooster, crow, background, men"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a propeller rotates loudly and intensely"], "sample_ids": ["w8uLijTqtlU", 
"ugHJF0hfYkg"], "start_seconds": ["70", "10"], "properties": ["wind, microphone, noise", "loud, intense, propeller"], "captions_pred_video": ["footage is blurry and shaky", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["the wind is blowing strongly", "a helicopter is flying overhead "], "question": "which is louder", "label": 0}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sLUnaPT5gM8", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["loud, laughter, intermittent", "airplane, boy, fly"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a clock ticktocks", "a machine engine runs and a man speaks"], "sample_ids": ["v-g-j2uTByM", "vs65y4qmyBE"], "start_seconds": ["30", "340"], "properties": ["ticktocks, clock, ticktocks", "engine, run, man"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "a car is engulfed in flames on the side of the road"], "captions_pred_audio": ["a clock is ticking loudly", "a heavy engine is running and men are speaking "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a man is snoring loudly and repeatedly", "a woman speaks over sizzling noise"], "sample_ids": ["sncRqQ67iJU", "yajyRTUQk3U"], "start_seconds": ["460", "400"], "properties": ["loud, repeatedly, man", "noise, woman, speak"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is snoring", "a woman is speaking while food is frying in the 
background"], "question": "which entity is speaking over noise", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["su6FAOcOA8c", "rwtmaKiCcQU"], "start_seconds": ["4", "30"], "properties": ["engine, idle, woman", "nozzle, depressed, spray can"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "spraying and people speaking"], "question": "which entity is about a spray can?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["tQWGZLItBXk", "vlS6YMeWAPo"], "start_seconds": ["170", "40"], "properties": ["voice, music, whoosh", "sheep, baa, birds"], "captions_pred_video": ["worms revolution screenshots", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a goat bleats and birds chirp"], "question": "which entity is more animal", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["vddP56-ogds", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["water, flow, laugh", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a child yells and another yells", "a woman and man speak while food is frying"], "sample_ids": 
["vMDHu7Lxcgw", "zk-xJGQU8-4"], "start_seconds": ["410", "130"], "properties": ["two, yell, child", "food, man, woman"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a woman speaks with water running", "people speak as gunfire rings out"], "sample_ids": ["wTideSjRFS0", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["water, running, woman", "gunfire, ring, speak"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and a gun is fired"], "question": "which entity is more calm", "label": 0}, {"captions": ["a car accelerates and wind blows", "a propeller rotates loudly and intensely"], "sample_ids": ["u0TrcHhkPQ", "ugHJF0hfYkg"], "start_seconds": ["20", "10"], "properties": ["accelerates, wind, blows", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a helicopter is flying overhead "], "question": "which entity is rotating", "label": 1}, {"captions": ["a small engine spits as it runs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sZvwOuuPGP0", "vbZ-0lGPneg"], "start_seconds": ["50", "30"], "properties": ["spits, engine, runs", "a woman, a television program, a bird"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a medium engine is 
running ", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a woman speaks and other women and a man talk with her"], "sample_ids": ["vYkA3cfXp5Q", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["engine, accelerate, idle", "a, woman, man"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["an engine is idling", "a woman is speaking and a man is speaking"], "question": "which entity is a group of people", "label": 1}, {"captions": ["a large crowd cheers and applauds", "small dogs yip and bark sharply"], "sample_ids": ["rqfQRErjfk8", "v-wcQf4BDY0"], "start_seconds": ["170", "120"], "properties": ["crowd, cheers, applauds", "bark, yip, sharply"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a dog barks and growls"], "question": "which entity is more likely to be a group of people", "label": 0}, {"captions": ["an electronic device bleeps once", "multiple people speak and children yell while water gurgles"], "sample_ids": ["tHJ6JSa8Y4", "vb1fPSDI4c"], "start_seconds": ["0", "30"], "properties": ["bleeps, electronic, device", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["a clock is ticking and beeping", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["the rumbling of a bus followed by a soft male voice", "a woman talks while something is fried and 
objects are tapped"], "sample_ids": ["vK93VuO0yNc", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["male voice, bus, rumble", "a woman, something, fried"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about a woman talking?", "label": 1}, {"captions": ["a man speaks as crickets sing", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["ryFDPxgDOGc", "y8WEcpOlT3I"], "start_seconds": ["570", "40"], "properties": ["a, crickets, sing", "harsh, wind, blows"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking as crickets sing?", "label": 0}, {"captions": ["a helicopter engine runs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["t5ZbXbniOWk", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["engine, helicopter, run", "engine, laugh, loud"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a helicopter is flying overhead ", "a jet engine roars "], "question": "which entity is not a helicopter?", "label": 1}, {"captions": ["water runs into a sink while men speak", "a telephone rings followed by a woman talking"], "sample_ids": ["vzceMbklWc", "tGcFnX0GHI"], "start_seconds": ["180", "0"], "properties": ["water, sink, run", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is 
running and a man is speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "people applaud and hoot and chat quietly"], "sample_ids": ["vh30P49Po6s", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["loud, continuous, quacks", "people, applaud, hoot"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "people are clapping and speaking with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a motor runs steadily as a man speaks, then the motor revs twice", "water flows and trickles"], "sample_ids": ["ylpYOorfH4o", "tB7hWb9gTuQ"], "start_seconds": ["410", "30"], "properties": ["motor, run, steady", "water, flow, trickle"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and an engine is revving", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["u21-Z5gJCB8", "uYT5gxnyMWM"], "start_seconds": 
["30", "50"], "properties": ["background, voice, man", "a, scream, girl"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a man talks while metallic objects are rapped and steam is released"], "sample_ids": ["ukxt9I7eMMg", "vuUVPzd2FXw"], "start_seconds": ["30", "160"], "properties": ["continuous, woman, speaking", "a, steam, release"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of the person cooking on the grill with a spatula"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and dishes are clanging"], "question": "which entity has a woman speaking towards the end?", "label": 0}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a car accelerates and wind blows"], "sample_ids": ["ziUT9IFTkjg", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["background, birds, rustling", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a race car accelerates and revs its engine "], "question": "which entity is a car?", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["yks4cLgIDMc", "ziUT9IFTkjg"], "start_seconds": ["170", "10"], "properties": ["background, speaking, child", "background, birds, rustling"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", 
"birds are chirping and a chime is ringing "], "question": "which entity has birds in the background?", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xOZfdgAgJ9o", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["woman, whimpering, speaking", "applause, audience, yells"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["paper folding and crinkling", "wind blows as people chatter quietly"], "sample_ids": ["zPpG3RD8lSs", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["paper, fold, crinkle", "wind, chatter, people"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "footage is blurry and out of focus"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a man is speaking with wind noise in the background "], "question": 
"which entity is more quiet", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sU53zg9Jp7s", "su6FAOcOA8c"], "start_seconds": ["380", "4"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "engine, idle, woman"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a woman is speaking and a subway train is moving "], "question": "which entity is a recording of a person speaking?", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yeFvk9x0wWI", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["chirp, twitter, clatter", "male, duck, laugh"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and ducks are quacking"], "question": "which entity is a human speaking?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "winds blows roughly as a vehicle races past"], "sample_ids": ["uYT5gxnyMWM", "xjvTpk2Zpr8"], "start_seconds": ["50", "70"], "properties": ["person, spray, yell", "wind, blows, vehicle"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a jet engine roars and wind blows "], "question": "which entity is more likely to be a person", "label": 0}, {"captions": ["people speak and laugh as a child speaks", "a person whistles a meandering tune"], "sample_ids": ["sa6TLVbooCc", 
"uFoga8sHpiw"], "start_seconds": ["240", "90"], "properties": ["people, laugh, child", "person, tune, whistle"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of a bird in a cage"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a person whistles a song"], "question": "which entity is a person?", "label": 1}, {"captions": ["continuous sneezing together with speech", "a clock ticks quietly and rhythmically"], "sample_ids": ["x4dZyf9Gbj0", "u7C-AEBQM"], "start_seconds": ["130", "30"], "properties": ["continuous, sneeze, speech", "ticks, rhythmic, quiet"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a group of people chatter and talk as multiple horns honk in the background"], "sample_ids": ["wnpJndXuxLc", "yLy-WycbVVE"], "start_seconds": ["50", "30"], "properties": ["blows, vehicle, train", "background, people, talk"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a soccer field in a stadium with yellow and red seats"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a man is speaking and a church bell is ringing with wind noise in the background "], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "plastic is tapped on while someone speaks"], "sample_ids": ["yJ0TePmaOo", "wvKpEYswXO0"], "start_seconds": ["390", "150"], "properties": ["two hard objects, man, speak", "plastic, tap, speak"], "captions_pred_video": [null, "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a woman is speaking and tapping with 
background noise and water running "], "question": "which object is being rubbed together", "label": 0}, {"captions": ["a man speaks while turning a water faucet on", "an engine runs loudly"], "sample_ids": ["vf9xf3vMsGM", "vqZuVbG6-HI"], "start_seconds": ["540", "130"], "properties": ["A man speaks while turning a water faucet on.", "loud, engine, run"], "captions_pred_video": ["of the person washing their hands under the faucet", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["a door slams shut and an object moves on a hard surface", "an infant crying as a woman laughs"], "sample_ids": ["zkKdxzNC97Y", "xhmRY9yhC7c"], "start_seconds": ["27", "20"], "properties": ["hard, surface, door", "a, laugh, infant"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a door is opened and closed", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["birds fly and flutter around", "people applaud and hoot and chat quietly"], "sample_ids": ["wGKgwOP3h30", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["fly, flutter, around", "people, applaud, hoot"], "captions_pred_video": ["of the pigeons in the coop", null], "captions_pred_audio": ["pigeons coo and flap their wings", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 0}, {"captions": ["a propeller moves loudly nearby", "a duck quacks loudly and continuously"], "sample_ids": ["ugHJF0hfYkg", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["loud, propeller, move", "loud, continuous, quacks"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "of a man brushing his teeth 
with a toothbrush in his mouth"], "captions_pred_audio": ["a helicopter is flying overhead ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["uYT5gxnyMWM", "wqZ135Ssz0"], "start_seconds": ["50", "60"], "properties": ["person, spray, yell", "two men, woman, birds"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yswmmRZFItk", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["background, frog, croak", "a woman, laughs, animal"], "captions_pred_video": ["a close up of a frog in the water", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a frog is croaking", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "a car speeding up in the distance"], "sample_ids": ["w8uLijTqtlU", "u0TrcHhkPQ"], "start_seconds": ["70", "20"], "properties": ["wind, microphone, noise", "distance, car, speed"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the wind is blowing strongly", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a clock ticktocks continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vlJS7LN2XyM", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["ticktocks, clock, ticktocks continuously", "engine, revs, vehicle"], 
"captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a ticktock of a clock", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a small engine spits as it runs", "an airplane engine runs"], "sample_ids": ["sZvwOuuPGP0", "yVPZ2MNWpms"], "start_seconds": ["50", "0"], "properties": ["spits, engine, runs", "engine, airplane, runs"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a medium engine is running ", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "a toilet flushes and a female speaks"], "sample_ids": ["sd7xVssqlw", "yaln9y8I7ms"], "start_seconds": ["50", "230"], "properties": ["accelerates, tires, squealing", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a toilet flushes and a man speaks"], "question": "which entity is a toilet?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["vddP56-ogds", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["water, flow, laugh", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["cats meow and then a 
person begins to talk while the cats continue to meow", "some tunes played by whistling"], "sample_ids": ["x5cuQjOdM3E", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["cat, talk, meow", "tune, play, whistling"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a cat meows and a woman speaks", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a train horn blows as it passes by"], "sample_ids": ["s4Uz1Ffgo04", "zVacuqSb4LI"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "horn, blows, train"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "water pouring and bubbling"], "sample_ids": 
["sofxkNWaP0s", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["wind, engine, louder", "water, bubbles, pouring"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an animal quacks rapidly", "wind blowing followed by a zoom"], "sample_ids": ["vh30P49Po6s", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["animal, quacks, rapidly", "wind, blow, zoom"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a duck is quacking loudly", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wRBHTgrbiwg", "y2bVZ7rz-5M"], "start_seconds": ["50", "280"], "properties": ["bird, owl, speak", "motor noise, horn, siren"], "captions_pred_video": ["of a bee pollinating the 
flowers in the field", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a truck is honking its horn and a siren is blaring "], "question": "which entity is more likely to be heard in a car", "label": 1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "wind blows as people chatter quietly"], "sample_ids": ["uOpoD0gGXcs", "xBxDz0CFVn0"], "start_seconds": ["120", "30"], "properties": ["chirps, woman, bird", "wind, chatter, people"], "captions_pred_video": ["a herd of cows grazing in the field", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["tjmoSi330GM", "tDVADusiIoc"], "start_seconds": ["23", "60"], "properties": ["speed, water, boat", "water, radio, man"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which is not a speedboat", "label": 1}, {"captions": ["a motorcycle engine is idling", "a infant makes noise and is excited"], "sample_ids": ["vZAqdHZ81yA", "wIJK3-5y0kA"], "start_seconds": ["180", "30"], "properties": ["engine, motorcycle, idling", "noise, excited, infant"], "captions_pred_video": ["a motorcycle is parked on the side of the road with its rear end facing the viewer", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["an engine is idling loudly", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["a 
large crowd cheers and applauds", "an airplane engine runs"], "sample_ids": ["rqfQRErjfk8", "yVPZ2MNWpms"], "start_seconds": ["170", "0"], "properties": ["crowd, cheers, applauds", "engine, airplane, runs"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a dog barks and whimpers", "small dogs yip and bark sharply"], "sample_ids": ["sShpyu2l4YQ", "v-wcQf4BDY0"], "start_seconds": ["0", "120"], "properties": ["barks, whimpers, dog", "bark, yip, sharply"], "captions_pred_video": ["the puppies are playing with a toy", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a dog is barking and growling", "a dog barks and growls"], "question": "which dog is more playful", "label": 1}, {"captions": ["a duck quacks continuously", "some tunes played by whistling"], "sample_ids": ["vh30P49Po6s", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["quacks, continuously, duck", "tune, play, whistling"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a duck is quacking loudly", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "some liquid flows while a woman laughs and man talks"], "sample_ids": ["sHbXC6na9hg", "vddP56-ogds"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "liquid, laughs, man"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", null], "captions_pred_audio": ["an engine is idling and vibrating", "water is running and gurgling and a man is speaking"], "question": "which 
entity is about a person cutting wood?", "label": 0}, {"captions": ["a woman speaks and a baby laughs", "a man speaks as a car is passing by"], "sample_ids": ["tOj4tdLRaA", "sK4u5T8hW78"], "start_seconds": ["70", "30"], "properties": ["woman, laugh, baby", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a baby?", "label": 0}, {"captions": ["a man makes an exclamation, then another man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xO-Q2BlIIPU", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["two men, exclamation, speak", "engine, laugh, loud"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a horn blasts as warning bells ring"], "sample_ids": ["zofjfKhqLk8", "zgUgkpk78xU"], "start_seconds": ["10", "70"], "properties": ["background, metal, clank", "horn, bells, ring"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 
1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a warning device", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "a girl talking, laughing and sneezing noise"], "sample_ids": ["yZrFNS7GFBQ", "y4tPJXBKDig"], "start_seconds": ["30", "20"], "properties": ["pigeon, buzzes, insect", "a, noise, talk"], "captions_pred_video": ["of the bird in the cage", "footage of the woman wiping her nose with a tissue"], "captions_pred_audio": ["an owl hoots in the background ", "a woman is speaking and coughing with background noise and breathing "], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a stream of water flows as people talk and wind blows"], "sample_ids": ["t69a8aRKhmc", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "stream, water, flow"], "captions_pred_video": ["footage is blurry and out of focus", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and wind blows", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sxIvBMSavMQ", "y8WEcpOlT3I"], "start_seconds": ["210", "40"], "properties": ["birds, chirp, wind", "harsh, wind, blows"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract 
honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "on how to use a sewing machine youtube"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 0}, {"captions": ["a man speaks over intermittent keyboard taps", "a person uses a saw to cut some wood"], "sample_ids": ["tw76HGONaKg", "sHbXC6na9hg"], "start_seconds": ["570", "0"], "properties": ["audio, man, keyboard", "a person, saw, wood"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "an engine is idling and vibrating"], "question": "which entity is a video of a 
person cutting wood?", "label": 1}, {"captions": ["a man talks as several small engines run", "a stream of water runs briefly"], "sample_ids": ["u9A6VZQCZpU", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["a, man, talk", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "an small aircraft engine runs and a boy speaks"], "sample_ids": ["xjhAnI2q6hM", "xSKJGCItUWE"], "start_seconds": ["6", "10"], "properties": ["engine revs, vehicle, people", "engine, run, boy"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a high pitched engine is running and a child speaks"], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vr8ZXjEBhMQ", "yDoT73BWsdA"], "start_seconds": ["150", "10"], "properties": ["wind, blow, zoom", "engine, revs, vehicle"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a race car accelerates and revs its engine "], "question": "which entity is not a zoom", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "some men converse over an engine running"], "sample_ids": ["yZrFNS7GFBQ", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["pigeon, buzzes, insect", "men, converse, engine"], "captions_pred_video": ["of the bird in the cage", null], 
"captions_pred_audio": ["an owl hoots in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a goat bleats as a person speaks", "an infant crying as a woman laughs"], "sample_ids": ["tPJvjq9QePY", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["bleats, person, speak", "a, laugh, infant"], "captions_pred_video": ["a dog and a sheep in a barn", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a baby cries and a man speaks", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "people cheer as a vehicle engine revs"], "sample_ids": ["sQGXqGcwOTc", "xjhAnI2q6hM"], "start_seconds": ["3", "6"], "properties": ["cling, speak, dishes", "engine revs, vehicle, people"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["u21-Z5gJCB8", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["background, voice, man", "harsh, wind, blows"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity has a harsh wind blowing?", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["uZesmtKZGSw", 
"w34HjHr6gAY"], "start_seconds": ["250", "30"], "properties": ["car, track, man", "beeps, hit, woman"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a beep sounds followed by a child speaking"], "question": "which entity has a man talking?", "label": 0}, {"captions": ["a male speaks and another male speaks", "pigeons vocalize and birds chirp"], "sample_ids": ["viuTg1M-dqg", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["two males, speaking, male", "vocalize, bird, chirp"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["an engine runs and a man speaks", "people speak as gunfire 
rings out"], "sample_ids": ["yT5WfYMRr-U", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["engine, run, man", "gunfire, ring, speak"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war", "label": 1}, {"captions": ["ticking continues without interruption", "a man speaks as a boat engine runs"], "sample_ids": ["v-g-j2uTByM", "wtDqrBygTcU"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "man, engine, run"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "shows a person riding on the back of a boat as it speeds through the water"], "captions_pred_audio": ["a clock is ticking loudly", "a man is speaking and a motor is running"], "question": "which is not a clock", "label": 1}, {"captions": ["children speak as a female ask them questions", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wEBlkGWVWwE", "uYT5gxnyMWM"], "start_seconds": ["260", "50"], "properties": ["female, speak, questions", "a, scream, girl"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman is speaking and a baby is crying"], "question": "which entity has a girl speaking followed by a scream?", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wvKpEYswXO0", "uYT5gxnyMWM"], "start_seconds": ["150", "50"], "properties": ["water, tap, run", "female, spraying, scream"], "captions_pred_video": ["of the person preparing food in the 
kitchen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person spraying and screaming?", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "a child speaks in closed space"], "sample_ids": ["w5W5Kqtc8E", "yW6FWLSLkx4"], "start_seconds": ["100", "40"], "properties": ["wind, blow, vehicle", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a person is snoring while sleeping", "a frog croaks as other frogs croak in the background"], "sample_ids": ["vJrjSeP17yE", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "background, frog, croak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a close up of a frog in the water"], "captions_pred_audio": ["a person snoring loudly", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "someone snores nearby"], "sample_ids": ["ujMt0-D-x2k", "spJCm8tD9Zo"], "start_seconds": ["0", "90"], "properties": ["snoring, rhythmical, nearby", "someone snores, nearby, someone"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a person is snoring loudly", "a person is snoring loudly"], "question": "which entity is more likely to be a person", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "some men converse 
over an engine running"], "sample_ids": ["y8WEcpOlT3I", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["harsh, wind, blows", "men, converse, engine"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["children cheer as a man speaks then an audience screams", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["vJvryTwuAV8", "wDVMhEdTiVw"], "start_seconds": ["16", "30"], "properties": ["audience, cheer, man", "gun, shoot, water"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "water splashes and a door squeaks"], "sample_ids": ["s4Uz1Ffgo04", "sdXV-ylviw"], "start_seconds": ["100", "190"], "properties": ["roars, background, people speaking", "sound, splash, door"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone whistles a song", "a drill drills through something then people begin laughing"], "sample_ids": ["sIXTftIuUgw", "tEE3MpBt1sg"], "start_seconds": ["90", "50"], "properties": ["someone, song, whistle", "drill, something, laugh"], "captions_pred_video": [null, "footage is blurry due to the smoke in the air"], 
"captions_pred_audio": ["a person whistling a song", "people are laughing breathing and speaking with background noise "], "question": "which entity is a drill?", "label": 1}, {"captions": ["a door slams shut roughly", "a man speaks as a car is passing by"], "sample_ids": ["zkKdxzNC97Y", "sK4u5T8hW78"], "start_seconds": ["27", "30"], "properties": ["a door, slams, shut", "a, car, pass"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a moving object", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "paper folding and crinkling"], "sample_ids": ["s7knHCFW82w", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["blow horn, get close, train", "paper, fold, crinkle"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how 
to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "the wind blows and a mouse clicks "], "question": "which is not a train", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "water splashes as an animal walks through"], "sample_ids": ["w-4gHptFNuU", "w1ir-sZ3Im8"], "start_seconds": ["21", "90"], "properties": ["engine revs, accelerates, bump", "animal, water, splashes"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a car accelerates and revs its engine ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to cause a splash", "label": 1}, {"captions": ["birds chirp and wind blows", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sxIvBMSavMQ", "zl9Dqx-j7q4"], "start_seconds": ["210", "6"], "properties": ["birds, chirp, wind", "engine, laugh, loud"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive 
beekeeping 101 how to extract honey from a", "footage of a man driving a car in the dark"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a telephone rings and a bird vocalizes", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["skd2PphS6oI", "sLUnaPT5gM8"], "start_seconds": ["190", "0"], "properties": ["ring, bird, vocalize", "loud, laughter, intermittent"], "captions_pred_video": ["a vintage telephone on a towel with a pair of scissors and a pair of pliers next to it", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a telephone bell rings repeatedly ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["paper is repeatedly crumpled and crinkled", "wind blowing followed by a zoom"], "sample_ids": ["vms5XGTDVQc", "vr8ZXjEBhMQ"], "start_seconds": ["220", "150"], "properties": ["paper, crumpled, crinkled", "wind, blow, zoom"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["paper is crumpled and crinkled", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "people applaud and hoot and chat quietly"], "sample_ids": ["uiS58TNyUiw", "wwyfGO2J4"], "start_seconds": ["430", "90"], "properties": ["vocalize, bird, chirp", "people, applaud, hoot"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a symphony", "label": 1}, {"captions": ["a baby laugh at a sputter", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sLUnaPT5gM8", 
"yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["laugh, sputter, baby", "engine, revs, vehicle"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "wind blows as people chatter quietly"], "sample_ids": ["wtDqrBygTcU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "wind, chatter, people"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a motor is running", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zY3icUyMdh8", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["dog, bark, engine", "a, scream, girl"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water drains", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sfAvvZwdLCY", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["water drains, flushes, water", "applause, audience, yells"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man is talking to another man on a stage in 
front of a microphone"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a man speaks as a vehicle engine idles"], "sample_ids": ["s7knHCFW82w", "shmR4OZtzqA"], "start_seconds": ["30", "30"], "properties": ["blow horn, get close, train", "man, engine, idle"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a man speaks while a motor runs"], "question": "which entity is stationary", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "wind blowing followed by a zoom"], "sample_ids": ["ukg5L09Wpvo", "vr8ZXjEBhMQ"], "start_seconds": ["150", "150"], "properties": ["clickety-clack, train, whistle", "wind, blow, zoom"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "wind blows and a chainsaw cuts through wood 
"], "question": "which entity is a zoom", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "people speak as gunfire rings out"], "sample_ids": ["x4a9YGIw4ok", "wqTCwqVRDlk"], "start_seconds": ["120", "80"], "properties": ["water, gurgles, stops", "gunfire, ring, speak"], "captions_pred_video": ["footage is blurry and out of focus", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a toilet flushes and water splashes", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a woman talks while a baby cries and a man whispers", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["smDKStoHBJo", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["a, talk, baby, cry", "engine, idle, woman"], "captions_pred_video": ["a man holding a crying baby in his arms", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["birds chirp and a dog breathes heavily", "water flows and trickles"], "sample_ids": ["y2ZBGpgbhHM", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["dog, chirp, breathe", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["birds chirping and a dog panting", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a stream of water runs briefly"], "sample_ids": ["sOa7g-44Dag", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["audio, scratching, man", "stream, water, run"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an 
attic", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "small dogs yip and bark sharply"], "sample_ids": ["wTideSjRFS0", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["food, sizzle, woman", "bark, yip, sharply"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a toilet flushes and a female speaks"], "sample_ids": ["uPDn2BFTHk", "yaln9y8I7ms"], "start_seconds": ["140", "230"], "properties": ["lady, laugh, baby", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a duck quacks continuously"], "sample_ids": ["wsHBIgzs9Fs", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["horn, continuous, buzzing", "quacks, continuously, duck"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an airplane accelerates briefly", "two men and a woman talk while wind blows and birds tweet"], 
"sample_ids": ["zjTG0gaGCUI", "wqZ135Ssz0"], "start_seconds": ["80", "60"], "properties": ["accelerates, airplane, briefly", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a jet engine roars as wind blows ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["a vehicle accelerates squealing tires", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sd7xVssqlw", "yajyRTUQk3U"], "start_seconds": ["50", "400"], "properties": ["accelerates, tires, squealing", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vYkA3cfXp5Q", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["engine, accelerate, idle", "men, talk, cars"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["an engine is idling", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a video of a vehicle 
engine accelerating then running on idle?", "label": 0}, {"captions": ["a frog vocalizes as birds chirp", "three men talk while wind blows and some liquid flows"], "sample_ids": ["wqUmIEzuNz4", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["frog, bird, vocalize", "three men, wind, flow"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a cat meows and rustles", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a frog?", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a man speaks on a radio as wind blows"], "sample_ids": ["y2ZBGpgbhHM", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["birds, tweet, pant", "man, radio, blows"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking on a radio as wind blows?", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "some men converse over an engine running"], "sample_ids": ["vSeGhaZt-aI", "sCiy7QS1U"], "start_seconds": ["50", "300"], "properties": ["water, bubbles, speak", "men, converse, engine"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a child babbles as a woman speaks", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["wEBlkGWVWwE", "wDVMhEdTiVw"], "start_seconds": ["260", "30"], "properties": ["a, babble, woman", "gun, shoot, water"], "captions_pred_video": ["shows a person 
writing on the whiteboard", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zcDwZ6W7E3E", "zj2R0XoFr5k"], "start_seconds": ["180", "50"], "properties": ["man, speak, motorcycles", "airplane, boy, fly"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a child speaks in closed space"], "sample_ids": ["tDlysoZiA1I", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["animal, grunts, chirps", "child, space, speak"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["w0xsN8X18Y", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["rain, thunder, surface", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], 
"captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "water flows and trickles"], "sample_ids": ["y4tPJXBKDig", "tB7hWb9gTuQ"], "start_seconds": ["20", "30"], "properties": ["a, noise, talk", "water, flow, trickle"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "sucking and grunting followed by slurping with birds in the background"], "sample_ids": ["sEprKHm8Sj8", "yYEVLuqEytU"], "start_seconds": ["90", "40"], "properties": ["car, tires, slows", "grunt, slurp, background"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "a baby goat is being petted by a person's hand"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "several sheep bleat and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a duck quacks loudly and continuously"], "sample_ids": ["zl9Dqx-j7q4", "vh30P49Po6s"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "loud, continuous, quacks"], "captions_pred_video": ["footage of a man driving a car in the dark", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a jet engine roars ", "a duck is 
quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["s4Uz1Ffgo04", "yDoT73BWsdA"], "start_seconds": ["100", "10"], "properties": ["roars, background, people speaking", "engine, revs, vehicle"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a race car accelerates and revs its engine "], "question": "which vehicle is revving its engine", "label": 1}, {"captions": ["a motorcycle engine works nearby", "small dogs growl, bark and yip."], "sample_ids": ["tOSWIURC-4", "sShpyu2l4YQ"], "start_seconds": ["0", "0"], "properties": ["engine, work, nearby", "growl, bark, yip"], "captions_pred_video": [null, "the puppies are playing with a toy"], "captions_pred_audio": ["a lawn mower is running ", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["sQGXqGcwOTc", "rqu8iB22IY"], "start_seconds": ["3", "5"], "properties": ["cling, speak, dishes", "sound, repeats, laugh"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a dog barks and a man speaks while music plays "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a man speaks as a machine runs", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vD6lYD1l0BY", "vbZ-0lGPneg"], "start_seconds": ["330", "30"], "properties": ["a, machine, run", "a woman, a television program, a bird"], "captions_pred_video": ["game 
controller being held in the hands of the person", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "people speak in a closed space"], "sample_ids": ["vYkA3cfXp5Q", "sTpirNYo8vQ"], "start_seconds": ["30", "30"], "properties": ["speed, idle, accelerate", "people, space, speak"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "of a man taking a selfie on a bus"], "captions_pred_audio": ["an engine is idling", "a man is speaking while a car is revving and accelerating "], "question": "which entity is stationary", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a child speaks in closed space"], "sample_ids": ["un9VQlzgZM", "yW6FWLSLkx4"], "start_seconds": ["5", "40"], "properties": ["wind, speak, laugh", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "waves crash against a shoreline and people speak"], "sample_ids": ["s59PfAghdkM", "yFB25fqfU8I"], "start_seconds": ["0", "300"], "properties": ["bird, chirp, background, horse, neigh", "wave, crash, shoreline"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "footage of a person surfing in the ocean"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "a man is speaking and laughing while water is 
splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["loud, continuous burping", "a toilet flushes and a female speaks"], "sample_ids": ["y636gklDioE", "yaln9y8I7ms"], "start_seconds": ["20", "230"], "properties": ["loud, continuous, burping", "female, flushes, toilet"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage is blurry and out of focus"], "captions_pred_audio": ["a person burps loudly several times", "a toilet flushes and a man speaks"], "question": "which entity is silent", "label": 1}, {"captions": ["water rushes by followed by a motorcycle zooming by in the distance", "some tunes played by whistling"], "sample_ids": ["s4Uz1Ffgo04", "u6BnG6YZqJ4"], "start_seconds": ["100", "0"], "properties": ["water, rushes, motorcycle", "tune, play, whistling"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tqR406bGiE", "vYkA3cfXp5Q"], "start_seconds": ["40", "30"], "properties": ["flush, water, gurgle", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a toilet is flushed", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "water is sprayed across a hard surface"], "sample_ids": ["vBHyYJ8pL0", "sQwlkXjQabo"], "start_seconds": ["2", "10"], "properties": ["noise, door, opening", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red 
car with water droplets on the hood"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "an infant crying as a woman laughs"], "sample_ids": ["xjvTpk2Zpr8", "xhmRY9yhC7c"], "start_seconds": ["70", "20"], "properties": ["engine, run, wind", "a, laugh, infant"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "paper is crumpling consistently"], "sample_ids": ["v0x1odnXtP0", "v5cSxLaHADY"], "start_seconds": ["210", "0"], "properties": ["keyboard, type, computer", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a person is typing on a keyboard", "paper is crumpled and crinkled"], "question": "which object is crumpling", "label": 0}, {"captions": ["a few ducks quack and scamper and a man speaks", "a car speeding up in the distance"], "sample_ids": ["w2bYrCVLT60", "u0TrcHhkPQ"], "start_seconds": ["120", "20"], "properties": ["ducks, speak, quack", "distance, car, speed"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "water splashes as an animal walks through"], "sample_ids": ["x5cuQjOdM3E", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["cat, 
talk, meow", "animal, water, splashes"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a cat meows and a woman speaks", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vs65y4qmyBE", "vb1fPSDI4c"], "start_seconds": ["340", "30"], "properties": ["engine, run, man", "multiple, people, yell"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person whistles a meandering tune", "a duck quacks continuously"], "sample_ids": ["uFoga8sHpiw", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["person, tune, whistle", "quacks, continuously, duck"], "captions_pred_video": ["footage of a bird in a cage", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person whistles a song", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["a consistent ticking pattern", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["sCeWURVHfOM", "tIY7qOV3rEM"], "start_seconds": ["30", "0"], "properties": ["ticking, pattern, clock", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["ticking of a clock", "a dog is barking and a cat is meowing"], "question": "which animal is barking", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "the 
clinking of a train bell with the humming of an engine and a train horn blowing"], "sample_ids": ["zFjIWfSD-4", "zgUgkpk78xU"], "start_seconds": ["410", "70"], "properties": ["People, motor, brakes", "clinking, humming, horn"], "captions_pred_video": [null, "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a train?", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a man speaks as a car is passing by"], "sample_ids": ["sShpyu2l4YQ", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["growl, bark, yip", "a, car, pass"], "captions_pred_video": ["the puppies are playing with a toy", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more passive", "label": 1}, {"captions": ["a male speaks and another male speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["viuTg1M-dqg", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["two males, speaking, male", "stream, water, flow"], "captions_pred_video": ["footage of water coming out of a hole in the 
ground", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane engine spools and people speak", "a person is burping then speaks and laughs"], "sample_ids": ["wTjoRj1se3U", "wAAkbZToh8"], "start_seconds": ["390", "0"], "properties": ["airplane, engine, spool", "burp, laugh, speak"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", null], "captions_pred_audio": ["a jet engine is running and people are talking", "a man burps and a woman speaks"], "question": "which entity is a person?", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "food is frying while a woman speaks"], "sample_ids": ["y4tPJXBKDig", "yhQ2Lg-7qDY"], "start_seconds": ["20", "130"], "properties": ["a, noise, talk", "food, woman, speak"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "a pan filled with meat and sauce being cooked on a stove top"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a faucet is running and a man is speaking"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "people applaud and hoot and chat quietly"], "sample_ids": ["yDoT73BWsdA", "wwyfGO2J4"], "start_seconds": ["10", "90"], "properties": ["engine revs, tires squeal, vehicle", "people, applaud, hoot"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "a vehicle engine accelerating then running on idle"], "sample_ids": 
["yDoT73BWsdA", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["engine revs, tires squeal, vehicle", "engine, accelerate, idle"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["multiple people speak and children yell while water gurgles", "birds chirp and an owl hoots before a man speaks briefly"], "sample_ids": ["vb1fPSDI4c", "wRBHTgrbiwg"], "start_seconds": ["30", "50"], "properties": ["multiple, people, yell", "bird, owl, speak"], "captions_pred_video": [null, "of a bee pollinating the flowers in the field"], "captions_pred_audio": ["a crowd of people are talking and laughing", "birds are chirping and insects are buzzing"], "question": "which entity has more animals speaking", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["shmR4OZtzqA", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["man, engine, idle", "music, gunfire, explosion"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "in your own 
words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man speaks while a motor runs", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vzxHnu-SFEw", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["two objects, woman, speak", "three men, wind, flow"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a man speaks as bees buzz and birds chirp", "vehicles pass by on a roadway"], "sample_ids": ["t25U-v4k4ts", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["bees buzz, birds chirp, man speaks", "pass, vehicle, roadway"], "captions_pred_video": ["of a beekeeper working on a 
beehive in the woods", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "some men converse over an engine running"], "sample_ids": ["y8dSeubCNI", "sCiy7QS1U"], "start_seconds": ["4", "300"], "properties": ["engine revving, people speaking, motorcycle", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine revving and people talking in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a motorcycle?", "label": 0}, {"captions": ["plastic is tapped on while someone speaks", "a toilet flushes and a female speaks"], "sample_ids": ["wvKpEYswXO0", "yaln9y8I7ms"], "start_seconds": ["150", "230"], "properties": ["plastic, tap, speak", "female, flushes, toilet"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a wooden clack accompanies nearby chirping birds", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yeFvk9x0wWI", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["clack, bird, chirp", "two men, woman, birds"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", null], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a stream of water flows as people talk and wind 
blows"], "sample_ids": ["uYT5gxnyMWM", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "stream, water, flow"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a child speaks in closed space"], "sample_ids": ["v0x1odnXtP0", "yW6FWLSLkx4"], "start_seconds": ["210", "40"], "properties": ["keyboard, type, computer", "child, space, speak"], "captions_pred_video": ["how to make money on youtube in spanish", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "a helicopter engine runs continuously"], "sample_ids": ["vlS6YMeWAPo", "ugHJF0hfYkg"], "start_seconds": ["40", "10"], "properties": ["noise, bleat, call", "engine, running, continuously"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a goat bleats and birds chirp", "a helicopter is flying overhead "], "question": "which entity is not a noise", "label": 1}, {"captions": ["an engine starts and increases in power", "an airplane engine spools and people speak"], "sample_ids": ["zjTG0gaGCUI", "wTjoRj1se3U"], "start_seconds": ["80", "390"], "properties": ["power, increase, engine", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a jet engine is 
running and people are talking"], "question": "which entity is a moving object", "label": 1}, {"captions": ["children cry and people talk", "a car speeding up in the distance"], "sample_ids": ["xLwHe825Zs", "u0TrcHhkPQ"], "start_seconds": ["18", "20"], "properties": ["people talk, children cry, people talk", "distance, car, speed"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby cries and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a child speaks", "a toilet door squeaks as it is opened"], "sample_ids": ["yW6FWLSLkx4", "sdXV-ylviw"], "start_seconds": ["40", "190"], "properties": ["a, child, speaks", "door, toilet, squeaks"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", null], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a dog barks and taps with background noise "], "question": "which entity is quieter", "label": 1}, {"captions": ["a stream of water flows quickly", "an engine runs loudly"], "sample_ids": ["wbHTKEJZyhc", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["stream, water, flow", "loud, engine, run"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a 
river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "dishes cling together then a man begins to speak"], "sample_ids": ["zofjfKhqLk8", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["background, metal, clank", "cling, speak, dishes"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "mechanisms are operating and water is splashing "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["material crumbles into a microphone", "an insect buzzes around continuously"], "sample_ids": ["vofpvUo6NAw", "v25l1jef3JY"], "start_seconds": ["220", "0"], "properties": ["material, crumbles, microphone", "buzzes, continuously, insect"], "captions_pred_video": ["person wrapping a toy car in a plastic bag", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["paper is being crumpled and crinkled", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a car speeding up in the distance"], "sample_ids": ["sfAvvZwdLCY", "u0TrcHhkPQ"], "start_seconds": 
["20", "20"], "properties": ["flushes, drains, water", "distance, car, speed"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wTideSjRFS0", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["food, sizzle, woman", "a, scream, girl"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking?", "label": 1}, {"captions": ["someone whistles a tune", "a duck quacks continuously"], "sample_ids": ["sIXTftIuUgw", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person whistling a song", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xvDdE3zNf8Y", "uZesmtKZGSw"], "start_seconds": ["120", "250"], "properties": ["a, female, speaks", "men, talk, cars"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a stream runs then someone speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["wbHTKEJZyhc", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["stream, run, someone", "people, applaud, hoot"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "people are clapping and speaking with background noise "], "question": "which entity 
is more active", "label": 1}, {"captions": ["a clock ticktocks briefly", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["u7C-AEBQM", "wqZ135Ssz0"], "start_seconds": ["30", "60"], "properties": ["ticktocks, clock, ticktocks briefly", "two men, woman, birds"], "captions_pred_video": [null, null], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a clock?", "label": 0}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "people applaud and hoot and chat quietly"], "sample_ids": ["zgUgkpk78xU", "wwyfGO2J4"], "start_seconds": ["70", "90"], "properties": ["clinking, humming, horn", "people, applaud, hoot"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a cat meows as a young woman speaks"], "sample_ids": ["y2ZBGpgbhHM", "x5cuQjOdM3E"], "start_seconds": ["30", "30"], "properties": ["animal, growl, bird", "cat, meows, young woman"], "captions_pred_video": [null, "a black background with an airplane flying in the sky"], "captions_pred_audio": ["birds chirping and a dog panting", "a cat meows and a woman speaks"], "question": "which entity is more likely to be domesticated", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "small dogs yip and bark sharply"], "sample_ids": ["yswmmRZFItk", "v-wcQf4BDY0"], 
"start_seconds": ["0", "120"], "properties": ["background, frog, croak", "bark, yip, sharply"], "captions_pred_video": ["a close up of a frog in the water", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a frog is croaking", "a dog barks and growls"], "question": "which entity is more likely to be a frog", "label": 0}, {"captions": ["animals bleat and moo as a person speaks", "men speak and a nozzle sprays liquid"], "sample_ids": ["tPJvjq9QePY", "wRV8yMk886E"], "start_seconds": ["40", "0"], "properties": ["animal, bleat, moo", "liquid, spray, nozzle"], "captions_pred_video": ["a dog and a sheep in a barn", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["a baby cries and a man speaks", "a man speaks followed by a loud burst"], "question": "which entity is more likely to be used in a science class", "label": 1}, {"captions": ["a woman sneezes then speaks", "a heavy rain falls endlessly"], "sample_ids": ["x4dZyf9Gbj0", "wP8ZKrlx3oA"], "start_seconds": ["130", "40"], "properties": ["sneezes, speaks, woman", "heavy, rain, fall"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a woman sneezes and speaks", "a heavy rain is falling on a surface"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "wind blows as people chatter quietly"], "sample_ids": ["uYT5gxnyMWM", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["person, spray, yell", "wind, chatter, people"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, 
{"captions": ["a vehicle engine accelerating then running on idle", "a man speaks followed by another man speaking outside"], "sample_ids": ["vYkA3cfXp5Q", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["engine, accelerate, idle", "two men, speak, follow"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["an engine is idling", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of a vehicle engine accelerating then running on idle?", "label": 0}, {"captions": ["the wind blows while a vehicle engine runs", "water is sprayed across a hard surface"], "sample_ids": ["xyL9F5VrjkE", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["wind, blows, vehicle", "water, spray, surface"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sYITalLZjj4", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["water, rushes, background, birds", "music, gunfire, explosion"], "captions_pred_video": ["two ducks are swimming in the water near each other", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["wind blows and birds chirp", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sQGXqGcwOTc", "vJ7JPEFhyLA"], "start_seconds": ["3", "16"], 
"properties": ["cling, speak, dishes", "three men, wind, flow"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about liquid flowing?", "label": 1}, {"captions": ["a machine beeps continuously", "a dog barks and whimpers"], "sample_ids": ["y682ml90jGw", "sShpyu2l4YQ"], "start_seconds": ["11", "0"], "properties": ["beeps, machine, continuously", "barks, whimpers, dog"], "captions_pred_video": [null, "the puppies are playing with a toy"], "captions_pred_audio": ["a beeping sound is being made ", "a dog is barking and growling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "a child speaks in closed space"], "sample_ids": ["u7C-AEBQM", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["ticks, rhythmic, quiet", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "some men converse over an engine running"], "sample_ids": ["w34HjHr6gAY", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["beeps, squawk, child speaking", "men, converse, engine"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of 
oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a video of a child speaking?", "label": 0}, {"captions": ["people cheer as a vehicle engine revs", "a child speaks in closed space"], "sample_ids": ["xjhAnI2q6hM", "yW6FWLSLkx4"], "start_seconds": ["6", "40"], "properties": ["engine revs, vehicle, people", "child, space, speak"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "wind blows as people chatter quietly"], "sample_ids": ["wwyfGO2J4", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["people, applaud, hoot", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tOj4tdLRaA", "uZesmtKZGSw"], "start_seconds": ["70", "250"], "properties": ["woman, laugh, baby", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a crowd yells, reacts and applauds", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wztCSUxOf8", "y8WEcpOlT3I"], "start_seconds": ["130", "40"], "properties": ["a crowd, yells, applauds", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man is speaking with wind noise in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["water flows followed by women screaming", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["w5W5Kqtc8E", "vzceMbklWc"], "start_seconds": ["100", "180"], "properties": ["water, flow, women", "water, faucet, sink"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "water is running and a man is speaking"], "question": "which entity shows water flowing?", "label": 0}, {"captions": ["wind blows and people scream while an engine revs", "a duck quacks continuously"], "sample_ids": ["w5W5Kqtc8E", "vh30P49Po6s"], "start_seconds": ["100", "30"], "properties": ["wind, engine, scream", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a duck is quacking loudly"], "question": 
"which entity is quieter", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a man speaks while rain falls onto a hard surface"], "sample_ids": ["w5W5Kqtc8E", "wqN6IIHw3po"], "start_seconds": ["100", "30"], "properties": ["water, splashes, motorboat", "rain, surface, fall"], "captions_pred_video": [null, "in your own words what is happening in this screenshot? blood splattered all over the place"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and water is splashing"], "question": "which entity is a video of rain falling?", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "three men talk while wind blows and some liquid flows"], "sample_ids": ["smGI3C1NZc", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["water, drain, toilet", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 0}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a male speaks and another male speaks"], "sample_ids": ["sLUnaPT5gM8", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "two males, speaking, male"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a pigeon cooing as an insect buzzes by briefly"], "sample_ids": ["zj2R0XoFr5k", "yZrFNS7GFBQ"], "start_seconds": ["50", "30"], "properties": ["airplane, 
fly, woman", "pigeon, buzzes, insect"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of the bird in the cage"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "an owl hoots in the background "], "question": "which entity is a bird?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "water pouring and bubbling"], "sample_ids": ["yPUYU6t3rwo", "uyRfq-jKPpo"], "start_seconds": ["370", "50"], "properties": ["birds chirp, objects are moved around, birds", "water, bubbles, pouring"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["insects buzz and a man speaks", "water is running from a faucet"], "question": "which entity is more likely to be in a bathroom", "label": 1}, {"captions": ["a infant makes noise and is excited", "an infant crying frantically"], "sample_ids": ["wIJK3-5y0kA", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["noise, excited, infant", "cry, infant, frantically"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of the baby crying in the car seat"], "captions_pred_audio": ["a baby cries and a woman speaks", 
"a baby cries loudly"], "question": "which infant is crying frantically", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "waves crash against a shoreline and people speak"], "sample_ids": ["yeFvk9x0wWI", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["chirp, twitter, clatter", "wave, crash, shoreline"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "footage of a person surfing in the ocean"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["smGI3C1NZc", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["water, drain, toilet", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "pigeons vocalize and birds chirp"], "sample_ids": ["xfudFO976zE", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["animal, bleats, cry", "vocalize, bird, chirp"], "captions_pred_video": ["footage is blurry and shaky", "of the pigeon in the cage"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a man is speaking and a bee is buzzing"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["a dog barks and whimpers", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sShpyu2l4YQ", "sLUnaPT5gM8"], "start_seconds": ["0", "0"], "properties": ["barks, whimpers, dog", "loud, laughter, intermittent"], 
"captions_pred_video": ["the puppies are playing with a toy", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a dog is barking and growling", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an aircraft engine runs as people speak", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["wTjoRj1se3U", "rwtmaKiCcQU"], "start_seconds": ["390", "30"], "properties": ["engine, run, people", "nozzle, depressed, spray can"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["a jet engine is running and people are talking", "spraying and people speaking"], "question": "which entity is a spray can?", "label": 1}, {"captions": ["a propeller rotates loudly and intensely", "a vehicle engine accelerating then running on idle"], "sample_ids": ["ugHJF0hfYkg", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["loud, intense, propeller", "engine, accelerate, idle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a helicopter is flying overhead ", "an engine is idling"], "question": "which is a vehicle", "label": 1}, {"captions": ["a person speaks briefly", "several insects fly while two men talk"], "sample_ids": ["zOZleIRqZm4", "s-T9OVOiMLo"], "start_seconds": ["80", "330"], "properties": ["person, talk, brief", "several, fly, men"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be a video", 
"label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sLUnaPT5gM8", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["loud, laughter, intermittent", "a woman, something, fried"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "winds blows roughly as a vehicle races past"], "sample_ids": ["w8uLijTqtlU", "xjvTpk2Zpr8"], "start_seconds": ["70", "70"], "properties": ["wind, microphone, noise", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry and shaky", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["the wind is blowing strongly", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["tdWhHV3X25Q", "ukg5L09Wpvo"], "start_seconds": ["60", "150"], "properties": ["applause, audience, yells", "clickety-clack, train, whistle"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a train whistle keeps going off while the 
clickety-clack of the train on the rails are continuous"], "sample_ids": ["yYEVLuqEytU", "ukg5L09Wpvo"], "start_seconds": ["40", "150"], "properties": ["animal, pig, background", "clickety-clack, train, whistle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a train horn sounds and railroad crossing ring", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["s7knHCFW82w", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["horn, sound, train", "engine, idle, woman"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a clock ticktocks"], "sample_ids": ["vs65y4qmyBE", "v-g-j2uTByM"], "start_seconds": ["340", "30"], "properties": ["wind, blows, strongly", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "vehicles pass by on a roadway"], "sample_ids": ["vMf1dLD6Sng", "tgbONvsP47Y"], "start_seconds": ["6", "0"], "properties": ["frog, bird, vocalize", "pass, vehicle, roadway"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", "footage 
of a fire truck entering a garage"], "captions_pred_audio": ["a frog croaks loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "paper folding and crinkling"], "sample_ids": ["wudZTNBtVqc", "zPpG3RD8lSs"], "start_seconds": ["60", "20"], "properties": ["accelerates, engine, wind", "paper, fold, crinkle"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a car accelerates and revs its engine ", "the wind blows and a mouse clicks "], "question": "which entity is stationary", "label": 1}, {"captions": ["a goat screams and people speak in the background", "a man speaks as a car is passing by"], "sample_ids": ["xC8kbrKJmco", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["background, goat, scream", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai 
accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a goat is bleating ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "waves crash against a shoreline and people speak"], "sample_ids": ["yDoT73BWsdA", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["engine revs, tires squeal, vehicle", "wave, crash, shoreline"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["someone snores nearby", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["spJCm8tD9Zo", "vbZ-0lGPneg"], "start_seconds": ["90", "30"], "properties": ["someone snores, nearby, someone", "a woman, a television program, a bird"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "vehicles pass by on a roadway"], "sample_ids": ["wvKpEYswXO0", "tgbONvsP47Y"], "start_seconds": ["150", "0"], "properties": ["plastic, tap, speak", "pass, vehicle, roadway"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a car 
is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["xM4joTqDVp4", "vlS6YMeWAPo"], "start_seconds": ["160", "40"], "properties": ["background, chirp, birds", "sheep, baa, birds"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a goat bleats and birds chirp"], "question": "which entity has more birds", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["y2ZBGpgbhHM", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["birds, tweet, pant", "clickety-clack, train, whistle"], "captions_pred_video": [null, "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["birds chirping and a dog panting", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "an engine runs loudly"], "sample_ids": ["vms5XGTDVQc", "vqZuVbG6-HI"], "start_seconds": ["220", "130"], "properties": ["paper, crumpled, crinkled", "loud, engine, run"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "footage is blurry because it's raining outside"], "captions_pred_audio": ["paper is crumpled and crinkled", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": ["u21-Z5gJCB8", 
"s4Uz1Ffgo04"], "start_seconds": ["30", "100"], "properties": ["background, voice, man", "roars, background, people speaking"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yLy-WycbVVE", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["background, people, talk", "female, spraying, scream"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "pigeons vocalize and birds chirp"], "sample_ids": ["u21-Z5gJCB8", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["background, voice, man", "vocalize, bird, chirp"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a stream of water runs briefly"], "sample_ids": ["yYEVLuqEytU", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["animal, pig, background", "stream, water, run"], "captions_pred_video": ["a 
baby goat is being petted by a person's hand", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a helicopter engine runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["t5ZbXbniOWk", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["engine, helicopter, run", "water, radio, man"], "captions_pred_video": ["the image is loading a screenshot of the game flight simulator 2004 on nintendo gamecube", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a helicopter?", "label": 1}, {"captions": ["an insect buzzes around continuously", "some men converse over an engine running"], "sample_ids": ["v25l1jef3JY", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["buzzes, continuously, insect", "men, converse, engine"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is not a person?", "label": 0}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["w2JXXIAdUdg", "w34HjHr6gAY"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "beeps, hit, woman"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "some tunes played by whistling"], "sample_ids": ["se87d6yxEOA", "u6BnG6YZqJ4"], "start_seconds": ["10", "0"], "properties": ["run, whistle, pass", "tune, play, whistling"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vddP56-ogds", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["water, splash, person, laugh", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a consistent ticking pattern", "birds chirp and objects are moved around"], "sample_ids": ["sCeWURVHfOM", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["ticking, pattern, clock", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["- a close-up view of the clock's inner workings", "footage and stock-footage/b-roll of a beekeeper 
opening a beehive"], "captions_pred_audio": ["ticking of a clock", "insects buzz and a man speaks"], "question": "which entity is more like a clock", "label": 0}, {"captions": ["someone sprays liquid onto a hard surface", "an insect buzzes around continuously"], "sample_ids": ["sQwlkXjQabo", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["liquid, surface, spray", "buzzes, continuously, insect"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["spraying followed by silence", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a stream of water runs briefly", "people speak as gunfire rings out"], "sample_ids": ["x-PeY8Yb8M4", "wqTCwqVRDlk"], "start_seconds": ["300", "80"], "properties": ["stream, water, run", "gunfire, ring, speak"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car is driving on a wet road ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xjhAnI2q6hM", "zl9Dqx-j7q4"], "start_seconds": ["6", "6"], "properties": ["engine revs, vehicle, people", "engine, laugh, loud"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a jet engine roars "], "question": "which entity is about a vehicle engine?", "label": 0}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xKB8O8LTs6s", 
"wqZ135Ssz0"], "start_seconds": ["70", "60"], "properties": ["music, radio, gunshots", "two men, woman, birds"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "paper is crumpling consistently"], "sample_ids": ["vcmWSmvti8", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["music, man, fire", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "paper is crumpled and crinkled"], "question": "which entity is not a video of a man speaking as music plays before artillery is fired?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sxYkFKFIZD0", "yDoT73BWsdA"], "start_seconds": ["20", "10"], "properties": ["screech, man, door", "engine, revs, vehicle"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a race car accelerates and revs its engine "], "question": "which vehicle is moving", "label": 1}, {"captions": ["birds chirp and objects are moved 
around", "vehicles pass by on a roadway"], "sample_ids": ["yPUYU6t3rwo", "tgbONvsP47Y"], "start_seconds": ["370", "0"], "properties": ["birds chirp, objects are moved around, birds", "pass, vehicle, roadway"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "footage of a fire truck entering a garage"], "captions_pred_audio": ["insects buzz and a man speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "a man speaks as a car is passing by"], "sample_ids": ["vcmWSmvti8", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["music, man, fire", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a man speaking as a car passes by?", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "dishes cling together then a man begins to speak"], "sample_ids": ["ziUT9IFTkjg", "sQGXqGcwOTc"], "start_seconds": ["10", "3"], "properties": ["background, birds, rustling", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, 
{"captions": ["a girl talking, laughing and sneezing noise", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["y4tPJXBKDig", "siJFXfGWgDk"], "start_seconds": ["20", "50"], "properties": ["a, noise, talk", "man, woman, vehicle"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a man and a woman speaking?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "people speak as gunfire rings out"], "sample_ids": ["xV7Mg1QucSc", "wqTCwqVRDlk"], "start_seconds": ["14", "80"], "properties": ["alarm, ticktocks, laughs", "gunfire, ring, speak"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["ul60S8TXDA8", "tiDFTC-5vU"], "start_seconds": ["60", "30"], "properties": ["sound, distance, bell", "male, duck, laugh"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", null], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["a man speaks as a car is passing by", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sK4u5T8hW78", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["a, car, pass", "music, gunfire, explosion"], 
"captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "winds blows roughly as a vehicle races past"], "sample_ids": ["yYEVLuqEytU", "xjvTpk2Zpr8"], "start_seconds": ["40", "70"], "properties": ["grunt, slurp, background", "wind, blows, vehicle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a baby laugh at a sputter", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sLUnaPT5gM8", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["laugh, sputter, baby", "animal, grunts, snorts"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "an airplane engine spools and people speak"], "sample_ids": ["slZLHwNbbt4", 
"wTjoRj1se3U"], "start_seconds": ["300", "390"], "properties": ["a, horn, run", "airplane, engine, spool"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a jet engine is running and people are talking"], "question": "which entity is a machine", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a man speaks as a motor runs in the background"], "sample_ids": ["y2ZBGpgbhHM", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["birds, tweet, pant", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["birds chirping and a dog panting", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "a toilet flushes and a female speaks"], "sample_ids": ["vYkA3cfXp5Q", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["speed, idle, accelerate", "female, flushes, toilet"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage is blurry and out of focus"], "captions_pred_audio": ["an engine is idling", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "water pouring and bubbling"], "sample_ids": ["u--KhUW8l1Y", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["engine, sound, horn", "water, bubbles, pouring"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ukg5L09Wpvo", "w5W5Kqtc8E"], "start_seconds": ["150", "100"], "properties": ["a train, a horn, a bell", "wind, blow, vehicle"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["yajyRTUQk3U", "sapQIQUhFc"], "start_seconds": ["400", "280"], "properties": ["a woman, something, fried", "liquid, flow, distance"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking and a stream is flowing in the background "], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["a person whistles and clicks a mouse", "continuous snoring"], 
"sample_ids": ["zCrAfDfv6-A", "sLkeqCDJIyw"], "start_seconds": ["30", "120"], "properties": ["person, mouse, click", "loud, snoring, noise"], "captions_pred_video": ["shows a man with glasses and a green shirt sitting on a couch in a living room", ", what is the man doing on the couch? sleeping"], "captions_pred_audio": ["a person whistles a song", "a person is snoring loudly"], "question": "which noise is louder", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a car accelerates and wind blows"], "sample_ids": ["ul60S8TXDA8", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["sound, distance, bell", "accelerates, wind, blows"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", null], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "water rushes and then a vehicle zooms past"], "sample_ids": ["vzxHnu-SFEw", "s4Uz1Ffgo04"], "start_seconds": ["80", "100"], "properties": ["two objects, woman, speak", "water, rushes, vehicle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is more active", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zofjfKhqLk8", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "multiple, people, yell"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a crowd of people are talking and laughing"], "question": "which entity has a lot of noise", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "pigeons vocalize and birds chirp"], "sample_ids": ["smGI3C1NZc", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["water, drain, toilet", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "some light rustling followed by a loud burp and a girl speaking"], "sample_ids": ["wy1eKjR7KC0", "vdoxuJn9lTc"], "start_seconds": ["30", "40"], "properties": ["people, talk, distance", "burp, loud, girl"], "captions_pred_video": ["two police officers riding motorcycles down the street", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a child 
speaks followed by a burp"], "question": "which entity is quieter", "label": 0}, {"captions": ["a drill drills through something then people begin laughing", "water running down a sink while a man is talking"], "sample_ids": ["tEE3MpBt1sg", "vSeGhaZt-aI"], "start_seconds": ["50", "50"], "properties": ["drill, something, laugh", "water, sink, talk"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a video of water running down a sink?", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["sU53zg9Jp7s", "ziUT9IFTkjg"], "start_seconds": ["380", "10"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "background, birds, rustling"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "birds are chirping and a chime is ringing "], "question": "which entity has a doorbell ringing?", "label": 0}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "pigeons vocalize and birds chirp"], "sample_ids": ["uC9dtII1KDI", "uiS58TNyUiw"], "start_seconds": ["150", "430"], "properties": ["wind, gusts, distance", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a saw finishes running as 
metal clings in the background", "people speak as gunfire rings out"], "sample_ids": ["zofjfKhqLk8", "wqTCwqVRDlk"], "start_seconds": ["10", "80"], "properties": ["background, metal, clings", "gunfire, ring, speak"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wtDqrBygTcU", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["man, engine, run", "multiple, people, yell"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", null], "captions_pred_audio": ["a man is speaking and a motor is running", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "wind blows as people chatter quietly"], "sample_ids": ["yVumC9TGknc", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["humming, clock, birds", "wind, chatter, people"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage is blurry and out of focus"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a stream of water flows as people talk and wind blows"], "sample_ids": ["wqN6IIHw3po", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "stream, water, flow"], "captions_pred_video": ["in your own words what is happening in this screenshot? 
blood splattered all over the place", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking with wind noise in the background "], "question": "which entity is flowing", "label": 1}, {"captions": ["a stream of water flows quickly", "wind blows as people chatter quietly"], "sample_ids": ["wbHTKEJZyhc", "xBxDz0CFVn0"], "start_seconds": ["20", "30"], "properties": ["stream, water, flow", "wind, chatter, people"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage is blurry and out of focus"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is moving more slowly", "label": 1}, {"captions": ["a soft wind underscores a woman laughing", "a duck quacks continuously"], "sample_ids": ["s6DESzUTGjY", 
"vh30P49Po6s"], "start_seconds": ["16", "30"], "properties": ["wind, laugh, woman", "quacks, continuously, duck"], "captions_pred_video": ["how to set up an aquarium in 10 easy steps youtube youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aquarium in 10 easy steps youtube how to set up an aqu", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a motorboat is moving with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a man speaks as a motor runs in the background"], "sample_ids": ["w2M4i1mklOA", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "background, motor, run"], "captions_pred_video": ["footage of an antique clock", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 1}, {"captions": ["scraping and female speech with distant music", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["yHeVV-xeOxQ", "y2bVZ7rz-5M"], "start_seconds": ["130", "280"], "properties": ["female, speech, music", "motor noise, horn, siren"], 
"captions_pred_video": ["of a girl milking a goat's udder", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a woman is speaking and scraping sounds are heard in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "people applaud and hoot and chat quietly"], "sample_ids": ["zO-LSSY92ZM", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["liquid, surface, sound", "people, applaud, hoot"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", null], "captions_pred_audio": ["steam is hissing and hissing", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an engine runs and a man speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yT5WfYMRr-U", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["engine, run, man", "a woman, a television program, a bird"], 
"captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["people speak as gunfire rings out", "water flows and trickles"], "sample_ids": ["wqTCwqVRDlk", "tB7hWb9gTuQ"], "start_seconds": ["80", "30"], "properties": ["gunfire, ring, speak", "water, flow, trickle"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and a gun is fired", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an infant crying as a woman laughs", "paper is crumpling consistently"], "sample_ids": ["xhmRY9yhC7c", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["a, laugh, infant", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a baby crying in a baby bouncer", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a baby cries and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["someone snores nearby", "someone is burping continuously"], "sample_ids": ["spJCm8tD9Zo", "y636gklDioE"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "burps, burps, burps"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a person is snoring loudly", "a person burps loudly several times"], "question": "which entity is more annoying", "label": 1}, {"captions": ["a person is whistling", "winds blows roughly as a vehicle races past"], "sample_ids": 
["sIXTftIuUgw", "xjvTpk2Zpr8"], "start_seconds": ["90", "70"], "properties": ["person, whistling, person", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a person whistling a song", "a jet engine roars and wind blows "], "question": "which entity is a person", "label": 0}, {"captions": ["continuous snoring", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sLkeqCDJIyw", "wDVMhEdTiVw"], "start_seconds": ["120", "30"], "properties": ["loud, snoring, noise", "gun, shoot, water"], "captions_pred_video": [", what is the man doing on the couch? sleeping", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a person is snoring loudly", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "dishes cling together then a man begins to speak"], "sample_ids": ["uC9dtII1KDI", "sQGXqGcwOTc"], "start_seconds": ["150", "3"], "properties": ["wind, gusts, distance", "cling, speak, dishes"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman speaking in the distance?", "label": 0}, {"captions": ["paper is repeatedly crumpled and crinkled", "a man speaks followed by another man speaking outside"], "sample_ids": ["vms5XGTDVQc", "viuTg1M-dqg"], "start_seconds": ["220", "30"], "properties": ["paper, crumpled, crinkled", "two men, speak, follow"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["paper is crumpled and crinkled", "a man is speaking with background 
noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "an airplane engine spools and people speak"], "sample_ids": ["vs65y4qmyBE", "wTjoRj1se3U"], "start_seconds": ["340", "390"], "properties": ["engine, run, man", "airplane, engine, spool"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a jet engine is running and people are talking"], "question": "which entity is a machine?", "label": 0}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a horn rings out as a machine runs by"], "sample_ids": ["t69a8aRKhmc", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["a, b, c", "a, horn, run"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity has a horn?", "label": 1}, {"captions": ["birds vocalize and a man speaks", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["v0wPrLBI3hg", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["vocalize, bird, speak", "gun, shoot, water"], "captions_pred_video": ["footage of the pigeons feeding on the ground", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["pigeons coo and flap wings while a man speaks ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "people cheer as a vehicle engine revs"], "sample_ids": ["uYT5gxnyMWM", "xjhAnI2q6hM"], 
"start_seconds": ["50", "6"], "properties": ["female, spraying, scream", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a person sniffs and sneezes", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["uRlbY6aoBU", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["sneezes, person, sniffs", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a man is speaking and ducks are quacking"], "question": "which entity is a person", "label": 0}, {"captions": ["a man talks while a clock does ticktock", "vehicles pass by on a roadway"], "sample_ids": ["spYNpeN7rPY", "tgbONvsP47Y"], "start_seconds": ["1", "0"], "properties": ["a clock, ticktock, man", "pass, vehicle, roadway"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a man speaks followed by another man speaking outside"], "sample_ids": ["sfAvvZwdLCY", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "two men, speak, follow"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["yYJksgsxx5U", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["audio, woman, silverware", "People, motor, 
brakes"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", null], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a woman speaking over chopping and silverware noises?", "label": 0}, {"captions": ["someone snores nearby", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["spJCm8tD9Zo", "yajyRTUQk3U"], "start_seconds": ["90", "400"], "properties": ["someone snores, nearby, someone", "a woman, something, fried"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tDlysoZiA1I", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["animal, grunt, multiple", "applause, audience, yells"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "a horse runs while two women talk"], "sample_ids": ["s3cTDAj31g", "sdvI1mHAsc"], "start_seconds": ["80", "20"], "properties": ["man, talk, woman", "two women, horse, run"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a baby is crying", "horses clip-clop and a woman speaks"], "question": "which entity has more people", "label": 1}, {"captions": ["an airplane engine roars increasingly 
louder", "a child speaks in closed space"], "sample_ids": ["vBslzh7saPw", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["engine, roar, louder", "child, space, speak"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["vZAw4apG0Es", "tDlysoZiA1I"], "start_seconds": ["30", "0"], "properties": ["background, clock, ticktocks", "animal, grunts, chirps"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a clock is ticking and people are talking", "birds are chirping and a rooster is crowing "], "question": "which entity is more animal like", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["xfudFO976zE", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["animal, bleats, cry", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and shaky", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["zdYdyF9-m8U", "yDoT73BWsdA"], "start_seconds": ["7", "10"], "properties": ["wind, crash, shoreline", "engine, revs, vehicle"], "captions_pred_video": ["a person kayaking in the ocean 
near a cliff", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["waves crash and wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is stationary", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["y8WEcpOlT3I", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["harsh, wind, blows", "three men, wind, flow"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a man speaks as horns blow", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["tHyNqRyK34A", "wqZ135Ssz0"], "start_seconds": ["24", "60"], "properties": ["a, man, speaks", "two men, woman, birds"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", null], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vddP56-ogds", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["water, flow, laugh", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 
the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["w34HjHr6gAY", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["beeps, squawk, child speaking", "motor noise, horn, siren"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["uoGVs9yUqY4", "yDoT73BWsdA"], "start_seconds": ["30", "10"], "properties": ["multiple, vocalize, wind", "engine, revs, vehicle"], "captions_pred_video": ["for how to make a wooden shed door youtube", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a race car accelerates and revs its engine "], 
"question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["y8WEcpOlT3I", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["harsh, wind, blows", "background, motor, run"], "captions_pred_video": ["on how to use a sewing machine youtube", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["bees buzz and wind blows", "a child speaks in closed space"], "sample_ids": ["tMJne1a4AFI", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["bees buzz, wind blows, bees", "child, space, speak"], "captions_pred_video": ["a swarm of bees on the ground", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["motors runs briefly and tires screech", "wind blowing followed by a zoom"], "sample_ids": ["yRx9txMcBl0", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["motors, tires, screech", "wind, blow, zoom"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 
mods cars, gta 5 mods cars, gta 5 mods cars, gta", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a car is revving its engine and skidding ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom?", "label": 1}, {"captions": ["a rumbling clap in the distance followed by a horn and the rumbling grows louder", "a clock ticktocks"], "sample_ids": ["slZLHwNbbt4", "v-g-j2uTByM"], "start_seconds": ["300", "30"], "properties": ["clap, distance, horn", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["birds chirp in the background while a horse neighs followed by a girl speaking", "water flows and trickles"], "sample_ids": ["s59PfAghdkM", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["bird, chirp, background, horse, neigh", "water, flow, trickle"], "captions_pred_video": ["is an anime scene featuring two people and a horse in the foreground and a fence in the background", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["birds are chirping a horse is neighing and a woman is speaking ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a toilet flushes and water sputters as it drains", "a propeller rotates loudly and intensely"], "sample_ids": ["smGI3C1NZc", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["water, drain, toilet", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a toilet is flushed", "a helicopter is flying overhead "], "question": "which entity is louder", 
"label": 1}, {"captions": ["someone is burping continuously", "multiple people speak and children yell while water gurgles"], "sample_ids": ["y636gklDioE", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["burps, burps, burps", "multiple, people, yell"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", null], "captions_pred_audio": ["a person burps loudly several times", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vqZuVbG6-HI", "y8WEcpOlT3I"], "start_seconds": ["130", "40"], "properties": ["background, male, female", "harsh, wind, blows"], "captions_pred_video": ["footage is blurry because it's raining outside", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking with wind noise in the background "], "question": "which entity has a harsh wind blowing?", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "females talk and laugh over gusting wind"], "sample_ids": ["xvDdE3zNf8Y", "un9VQlzgZM"], "start_seconds": ["120", "5"], "properties": ["A, crumple, paper", "females, talk, laugh"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is a group of people", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vlS6YMeWAPo", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["sheep, baa, birds", "applause, audience, yells"], 
"captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a goat bleats and birds chirp", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a man speaks as a car is passing by"], "sample_ids": ["u--KhUW8l1Y", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["horn, siren, life", "a, car, pass"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["tQWGZLItBXk", "y8WEcpOlT3I"], "start_seconds": ["170", "40"], "properties": ["music, person, ding", "harsh, wind, blows"], "captions_pred_video": ["worms revolution screenshots", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a man is speaking with wind noise in the background "], "question": "which entity has a harsh wind blowing?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "small dogs yip and bark sharply"], "sample_ids": ["sa6TLVbooCc", "v-wcQf4BDY0"], "start_seconds": 
["240", "120"], "properties": ["people, laugh, child", "bark, yip, sharply"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["y2ZBGpgbhHM", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["animal, growl, bird", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["birds chirping and a dog panting", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a duck quacks continuously"], "sample_ids": ["xKB8O8LTs6s", "vh30P49Po6s"], "start_seconds": ["70", "30"], "properties": ["music, gunfire, explosion", "quacks, continuously, duck"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a duck is quacking loudly"], "question": "which entity is a duck?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["rqfQRErjfk8", "yajyRTUQk3U"], "start_seconds": ["170", "400"], "properties": ["crowd, cheers, applauds", "a woman, something, fried"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a woman is speaking while food is frying in the 
background"], "question": "which entity is a demonstration of cooking?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["zl9Dqx-j7q4", "uOpoD0gGXcs"], "start_seconds": ["6", "120"], "properties": ["motors rev, laugh, loudly", "chirps, woman, bird"], "captions_pred_video": ["footage of a man driving a car in the dark", "a herd of cows grazing in the field"], "captions_pred_audio": ["a jet engine roars ", "birds are chirping and a man is speaking"], "question": "which entity is a response to a human action", "label": 1}, {"captions": ["an airplane flies overhead as a woman speaks", "water pouring and bubbling"], "sample_ids": ["zj2R0XoFr5k", "uyRfq-jKPpo"], "start_seconds": ["50", "50"], "properties": ["airplane, fly, overhead", "water, bubbles, pouring"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a clock 
ticktocks"], "sample_ids": ["zgUgkpk78xU", "v-g-j2uTByM"], "start_seconds": ["70", "30"], "properties": ["clinking, humming, horn", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "vehicles pass by on a roadway"], "sample_ids": ["zY3icUyMdh8", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["dog, bark, engine", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a car is driving on the road "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zTLVJCo4WEE", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "male, duck, laugh"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and crickets chirp", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking?", "label": 1}, {"captions": ["an engine starts and increases in power", "roadway noise occurs and a truck accelerates"], "sample_ids": ["zjTG0gaGCUI", "tgbONvsP47Y"], "start_seconds": ["80", "0"], "properties": ["power, 
increase, engine", "noise, truck, accelerate"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "a motor slows to a stopover traffic noises"], "sample_ids": ["zcDwZ6W7E3E", "zofjfKhqLk8"], "start_seconds": ["180", "10"], "properties": ["man, speak, motorcycles", "noise, stop, motor"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a large engine is running and a bell is ringing"], "question": "which is a stopover", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a telephone rings followed by a woman talking"], "sample_ids": ["vzxHnu-SFEw", "tGcFnX0GHI"], "start_seconds": ["80", "0"], "properties": ["two objects, woman, speak", "ring, talk, woman"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which woman is talking", "label": 1}, {"captions": ["a man talks while vehicles pass by", "small dogs yip and bark sharply"], "sample_ids": ["sK4u5T8hW78", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["a, man, talk", "bark, yip, sharply"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uEU-Hg5MTN8", "wDVMhEdTiVw"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "gun, shoot, water"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause injury", "label": 1}, {"captions": ["paper folding and crinkling", "a child speaks in closed space"], "sample_ids": ["zPpG3RD8lSs", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["paper, fold, crinkle", "child, space, speak"], 
"captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is not a child?", "label": 0}, {"captions": ["some men converse over an engine running", "someone is typing on a computer keyboard"], "sample_ids": ["sCiy7QS1U", "v0x1odnXtP0"], "start_seconds": ["300", "210"], "properties": ["men, converse, engine", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a person is typing on a keyboard"], "question": "which is a more active activity", "label": 0}, {"captions": ["a kid speaks followed by music playing", "a large crowd cheers and applauds"], "sample_ids": ["tQWGZLItBXk", "rqfQRErjfk8"], "start_seconds": ["170", "170"], "properties": ["music, kid, speak", "crowd, cheers, applauds"], "captions_pred_video": ["worms revolution screenshots", "a man hugging another man in front of an orchestra"], "captions_pred_audio": 
["a child speaks music plays video game sounds sound effects and sound effects play ", "a crowd of people clapping and cheering"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tOj4tdLRaA", "su6FAOcOA8c"], "start_seconds": ["70", "4"], "properties": ["woman, laugh, baby", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking and a subway train is moving "], "question": "which woman is speaking", "label": 1}, {"captions": ["a child speaks in closed space", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["yW6FWLSLkx4", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["child, space, speak", "a, scream, girl"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is a girl?", "label": 1}, {"captions": ["children cry and people talk", "music plays, a person speaks, followed by whooshes and a ding"], "sample_ids": ["xLwHe825Zs", "tQWGZLItBXk"], "start_seconds": ["18", "170"], "properties": ["people talk, children cry, people talk", "music, person, ding"], "captions_pred_video": [null, "worms revolution screenshots"], "captions_pred_audio": ["a baby cries and a woman speaks", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a person speaking?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "wind blows as people chatter quietly"], "sample_ids": ["x6ijhqRY38s", "xBxDz0CFVn0"], "start_seconds": 
["250", "30"], "properties": ["bowl, silverware, man", "wind, chatter, people"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["food is frying then a woman speaks", "a man woman speak while crickets sing"], "sample_ids": ["ukxt9I7eMMg", "zTLVJCo4WEE"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "a, crickets, sing"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman speaks and crickets chirp"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a toilet flushes and a female speaks"], "sample_ids": ["tDlfY3nmx1A", "yaln9y8I7ms"], "start_seconds": ["160", "230"], "properties": ["applause, laugh, man", "female, flushes, toilet"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage is blurry and out of focus"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a toilet flushes and a man speaks"], "question": "which entity is a video of a toilet flushing?", "label": 1}, {"captions": ["a rumble grows louder", "a stream of water runs briefly"], "sample_ids": ["y4MY9mp8-TA", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["loudness, increase, rumble", "stream, water, run"], "captions_pred_video": ["a helicopter flying in the sky", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a helicopter flies overhead ", "a car is driving on a wet road "], "question": "which 
entity is a stream of water?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a person whistles a meandering tune"], "sample_ids": ["t25U-v4k4ts", "uFoga8sHpiw"], "start_seconds": ["40", "90"], "properties": ["bees buzz, birds chirp, man speaks", "person, tune, whistle"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a bird in a cage"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a person whistles a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "waves crash against a shoreline and people speak"], "sample_ids": ["tjmoSi330GM", "yFB25fqfU8I"], "start_seconds": ["23", "300"], "properties": ["speed, water, boat", "wave, crash, shoreline"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is moving slower", "label": 0}, {"captions": ["birds chirp and an insect buzzes around", "several insects fly while two men talk"], "sample_ids": ["t97k0cejSQE", "s-T9OVOiMLo"], "start_seconds": ["250", "330"], "properties": ["bird, chirp, insect", "several, fly, men"], "captions_pred_video": ["a bee on a purple thistle flower", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "a man speaks as a motor runs in the background"], "sample_ids": ["sHbXC6na9hg", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["a person, saw, wood", "background, motor, run"], "captions_pred_video": ["a man using a 
tractor to cut a log into firewood youtube", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an engine is idling and vibrating", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["an airplane engine runs", "a duck quacks continuously"], "sample_ids": ["yVPZ2MNWpms", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "quacks, continuously, duck"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a car is driving by on the road ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "plastic is tapped on while someone speaks"], "sample_ids": ["xO-Q2BlIIPU", "wvKpEYswXO0"], "start_seconds": ["30", "150"], "properties": ["two men, exclamation, speak", "plastic, tap, speak"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is about a man speaking to another man?", "label": 0}, {"captions": ["a woman speaks in a fast tone with a male", "a stream of water flows as people talk and wind blows"], "sample_ids": ["sTpirNYo8vQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, tone, fast", "stream, water, flow"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking with wind noise in the background "], "question": "which entity is 
moving", "label": 1}, {"captions": ["a male speaks over some small clicks", "a telephone rings followed by a woman talking"], "sample_ids": ["uXxVebHsGZ8", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["male, clicks, speak", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "some men converse over an engine running"], "sample_ids": ["shmR4OZtzqA", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["man, engine, idle", "men, converse, engine"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", null], "captions_pred_audio": ["a man speaks while a motor runs", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows a man speaking as a vehicle engine idles?", "label": 0}, {"captions": ["a person screams glaringly", "a train horn blows as it passes by"], "sample_ids": ["xC8kbrKJmco", "zVacuqSb4LI"], "start_seconds": ["0", "30"], "properties": ["glaringly, screams, person", "horn, blows, train"], "captions_pred_video": [null, "by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a goat is bleating ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows strongly", "waves crash against a shoreline and people speak"], "sample_ids": ["w8uLijTqtlU", "yFB25fqfU8I"], "start_seconds": ["70", "300"], "properties": ["wind, blows, strongly", "wave, crash, shoreline"], "captions_pred_video": ["footage is blurry and shaky", "footage of a person surfing in the ocean"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more powerful", "label": 1}, {"captions": ["bees buzz and wind blows", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tMJne1a4AFI", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["bees buzz, wind blows, bees", "men, talk, cars"], "captions_pred_video": ["a swarm of bees on the ground", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vb1fPSDI4c", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a weapon fires multiple times", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sMC07Ucy7kg", "zFjIWfSD-4"], "start_seconds": ["10", "410"], "properties": ["weapon, fire, multiple", "People, motor, brakes"], "captions_pred_video": ["footage is from a car's point of view", null], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a 
train blows a horn", "an engine runs loudly"], "sample_ids": ["wnpJndXuxLc", "vqZuVbG6-HI"], "start_seconds": ["50", "130"], "properties": ["blows, vehicle, train", "loud, engine, run"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a lawn mower is running and men are speaking "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tw76HGONaKg", "uZesmtKZGSw"], "start_seconds": ["570", "250"], "properties": ["music, click, man", "men, talk, cars"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["goats bleat and metal clings", "pigeons vocalize and birds chirp"], "sample_ids": ["tH17JPjDPnc", "uiS58TNyUiw"], "start_seconds": ["260", "430"], "properties": ["bleat, metal, clings", "vocalize, bird, chirp"], "captions_pred_video": ["feed of the goats eating hay in the barn", "of the pigeon in the cage"], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking and a bee is buzzing"], "question": "which animal is vocalizing", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a woman talks while a baby cries and a man whispers"], "sample_ids": ["sG7TyPnFDR0", "smDKStoHBJo"], "start_seconds": ["180", "0"], "properties": ["beeps, machine, smoke alarm", "a, talk, baby, cry"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "a man holding a crying baby in his arms"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a baby is crying and a woman is speaking"], "question": "which entity has a baby?", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a man speaks as a car is passing by"], "sample_ids": ["vZAw4apG0Es", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["background, tick, repeat", "a, car, pass"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a moving object", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "long loud burping by a man"], "sample_ids": ["uzQnlJXBbOM", "xmiUIOhtZyQ"], "start_seconds": ["50", "60"], "properties": ["ringing, beep, stop", "loud, burp, man"], "captions_pred_video": ["footage of a person using a cell phone on a table", "homer simpson drinking a beer"], "captions_pred_audio": ["a telephone rings and a man speaks", "a person burps and music plays in the background "], "question": "which is louder", "label": 1}, {"captions": ["someone sprays a liquid onto a hard surface making a hiss sound", "a person snores loudly multiple times at a close distance"], "sample_ids": ["zO-LSSY92ZM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["liquid, surface, sound", "loud, multiple, distance"], "captions_pred_video": ["youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's 
air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'", null], "captions_pred_audio": ["steam is hissing and hissing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks continuously", "wind blows as people chatter quietly"], "sample_ids": ["vlJS7LN2XyM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "wind, chatter, people"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage is blurry and out of focus"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a train engine runs and a horn blows", "several insects fly while two men talk"], "sample_ids": ["zPX9o1uDiI", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["engine, horn, run", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["yZmhM1HcsyE", "vbZ-0lGPneg"], "start_seconds": ["4", "30"], "properties": ["engine, roar, water", "a woman, a television program, a bird"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a motorboat speeds 
through water with wind noise in the background ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a duck quacks continuously"], "sample_ids": ["sxYkFKFIZD0", "vh30P49Po6s"], "start_seconds": ["20", "30"], "properties": ["screech, man, door", "quacks, continuously, duck"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "birds chirp in the background while a horse neighs followed by a girl speaking"], "sample_ids": ["vs65y4qmyBE", "s59PfAghdkM"], "start_seconds": ["340", "0"], "properties": ["engine, run, man", "bird, chirp, background, horse, neigh"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "is an anime scene featuring two people and a horse in the foreground and a fence in the background"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "birds are chirping a horse is neighing and a woman is speaking "], "question": "which entity has a horse neighing?", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "some men converse over an engine running"], "sample_ids": ["x5cuQjOdM3E", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["cat, meows, young woman", "men, converse, engine"], "captions_pred_video": ["a black 
background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more active", "label": 1}, {"captions": ["a man talks while vehicles pass by", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sK4u5T8hW78", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["a, man, talk", "gun, shoot, water"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used for hunting", "label": 1}, {"captions": ["a stream runs then someone speaks", "a woman sneezes then speaks"], "sample_ids": ["wbHTKEJZyhc", "x4dZyf9Gbj0"], "start_seconds": ["20", "130"], "properties": ["stream, run, someone", "sneezes, speaks, woman"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river 
in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage is blurry and out of focus"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a woman sneezes and speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a person sniffles and sneezes", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["uRlbY6aoBU", "w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["sneezes, sniffles, person", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["s4Uz1Ffgo04", "vbZ-0lGPneg"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "a woman, a television program, a bird"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman is speaking and a dog is whimpering"], "question": "which entity is more quiet", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "paper is crumpling 
consistently"], "sample_ids": ["w8uLijTqtlU", "v5cSxLaHADY"], "start_seconds": ["70", "0"], "properties": ["wind, microphone, noise", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and shaky", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["the wind is blowing strongly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an insect buzzes around continuously", "three men talk while wind blows and some liquid flows"], "sample_ids": ["v25l1jef3JY", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["buzzes, continuously, insect", "three men, wind, flow"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a person", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "dishes cling together then a man begins to speak"], "sample_ids": ["uYT5gxnyMWM", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["person, spray, yell", "cling, speak, dishes"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "mechanisms are operating and water is splashing "], "question": "which entity is about a person speaking?", "label": 0}, {"captions": ["a vehicle engine runs while a siren and horn sound", "people applaud and hoot and chat quietly"], "sample_ids": ["u--KhUW8l1Y", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["engine, sound, horn", "people, applaud, hoot"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", null], "captions_pred_audio": ["a 
fire truck siren blares and a horn blows ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["uRExseg-0XI", "vfYTJq7nU"], "start_seconds": ["210", "130"], "properties": ["woman, man, water", "rustling, ducks, quack"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", null], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a duck quacks and a woman speaks"], "question": "which entity has more water", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a telephone rings followed by a woman talking"], "sample_ids": ["zY3icUyMdh8", "tGcFnX0GHI"], "start_seconds": ["20", "0"], "properties": ["dog, bark, engine", "ring, talk, woman"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "a clock alarm sounds and gears turn"], "sample_ids": ["zALy31PjDl0", "w2M4i1mklOA"], "start_seconds": ["21", "30"], "properties": ["a man, a vehicle, a horn", "alarm, gears, turn"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage of an antique clock"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a clock is ticking and a bell is ringing "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a machine runs continuously", "various birds chirp and squeal, and an animal 
grunts"], "sample_ids": ["wdXV3Pv0jiY", "tDlysoZiA1I"], "start_seconds": ["11", "0"], "properties": ["machine, running, continuously", "animal, grunts, chirps"], "captions_pred_video": ["footage is blurry and shaky", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "birds are chirping and a rooster is crowing "], "question": "which entity is not a machine?", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["shmR4OZtzqA", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "rooster, crow, background, men"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man speaks while a motor runs", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "animals bleat and moo as a person speaks"], "sample_ids": ["vddP56-ogds", "tPJvjq9QePY"], "start_seconds": ["30", "40"], "properties": ["liquid, laughs, man", "animal, bleat, moo"], 
"captions_pred_video": [null, "a dog and a sheep in a barn"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a baby cries and a man speaks"], "question": "which entity is more animal-like", "label": 1}, {"captions": ["people speak as gunfire rings out", "a man speaks as a car is passing by"], "sample_ids": ["wqTCwqVRDlk", "sK4u5T8hW78"], "start_seconds": ["80", "30"], "properties": ["gunfire, ring, speak", "a, car, pass"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a man is speaking with background noise and breathing sounds "], "question": "which entity is more calm", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["sa6TLVbooCc", "yDoT73BWsdA"], "start_seconds": ["240", "10"], "properties": ["people, laugh, child", "engine, revs, vehicle"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sDSppXIlJrs", "sSMl2vc3ek"], "start_seconds": ["27", "20"], "properties": ["microphone, water, wind", "loud, multiple, 
distance"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", null], "captions_pred_audio": ["the wind is blowing and water is splashing", "a person snoring loudly"], "question": "which is louder", "label": 1}, {"captions": ["someone is snoring while sleeping", "a man speaks followed by another man speaking outside"], "sample_ids": ["ujMt0-D-x2k", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["snore, sleep, someone", "two men, speak, follow"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity shows two men speaking?", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "some tunes played by whistling"], "sample_ids": ["xvDdE3zNf8Y", "u6BnG6YZqJ4"], "start_seconds": ["120", "0"], "properties": ["a, female, speaks", "tune, play, whistling"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman speaks and crumples paper", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a stream of water runs briefly"], "sample_ids": ["t25U-v4k4ts", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["bees buzz, birds chirp, man speaks", "stream, water, run"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a man speaks over a radio as wind blows and water 
splashes"], "sample_ids": ["t97k0cejSQE", "tDVADusiIoc"], "start_seconds": ["250", "60"], "properties": ["bird, chirp, insect", "water, radio, man"], "captions_pred_video": ["a bee on a purple thistle flower", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["t25U-v4k4ts", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["a, chirps, bird", "sheep, baa, birds"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a goat bleats and birds chirp"], "question": "which entity has more animals", "label": 1}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "birds chirp in the background while a horse neighs followed by a girl speaking"], "sample_ids": ["wvKpEYswXO0", "s59PfAghdkM"], "start_seconds": ["150", "0"], "properties": ["sound, water, running", "bird, chirp, background, horse, neigh"], "captions_pred_video": ["of the person preparing food in the kitchen", "is an anime scene featuring two people and a horse in the foreground and a fence in the background"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "birds are chirping a horse is neighing and a woman is speaking "], "question": "which entity has a horse neighing?", "label": 1}, {"captions": ["continuous sneezing together with speech", "a woman speaks happily and an animal chirps"], "sample_ids": ["x4dZyf9Gbj0", "uWAAAL4CIoc"], "start_seconds": ["130", "0"], "properties": ["continuous, sneeze, speech", "a woman, chirps, 
animal"], "captions_pred_video": ["footage is blurry and out of focus", null], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and a dog is barking "], "question": "which entity is more like a bird", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["y8WEcpOlT3I", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["harsh, wind, blows", "rustling, ducks, quack"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a duck quacks and a woman speaks"], "question": "which entity is about a harsh wind blowing?", "label": 0}, {"captions": ["heavy rain splashes as it falls", "vehicles pass by on a roadway"], "sample_ids": ["wP8ZKrlx3oA", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["fall, rain, splash", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["children cry and people talk", "a telephone rings followed by a woman talking"], "sample_ids": ["xLwHe825Zs", "tGcFnX0GHI"], "start_seconds": ["18", "0"], "properties": ["people talk, children cry, people talk", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby cries and a woman speaks", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["a child yells and another yells", "a toilet flushes and water drains"], "sample_ids": ["vMDHu7Lxcgw", "sfAvvZwdLCY"], "start_seconds": ["410", 
"20"], "properties": ["two, yell, child", "water drains, flushes, water"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["birds chirp as a bell rings", "water flows as men speak and yell"], "sample_ids": ["ziUT9IFTkjg", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["chirp, bell, ring", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["an airplane engine runs", "an insect buzzes around continuously"], "sample_ids": ["yVPZ2MNWpms", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["engine, airplane, runs", "buzzes, continuously, insect"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a car is driving by on the road ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vcmWSmvti8", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["music, man, fire", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more liquid flowing", "label": 1}, {"captions": ["food is frying and 
sizzles", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zNRChLjqcU", "xKB8O8LTs6s"], "start_seconds": ["220", "70"], "properties": ["food is frying, sizzles, food", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["water is running from a faucet into a sink", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a movie?", "label": 1}, {"captions": ["some tunes played by whistling", "water runs from a faucet while some men speak and the water runs in the sink"], "sample_ids": ["u6BnG6YZqJ4", "vzceMbklWc"], "start_seconds": ["0", "180"], "properties": ["tune, play, whistling", "water, faucet, sink"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", null], "captions_pred_audio": ["a person whistling a song", "water is running and a man is speaking"], "question": "which entity is a video of a person playing a tune?", "label": 0}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "dishes cling together then a man begins to speak"], "sample_ids": ["uqFtmnhuqA8", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["a, b, c", "cling, speak, dishes"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "mechanisms are operating and water is splashing "], "question": "which entity is about a clock?", "label": 0}, {"captions": ["a man speaks as crickets sing", "an adult speaks and is typing on a computer keyboard"], "sample_ids": ["ryFDPxgDOGc", "x9JovgqUcs"], "start_seconds": ["570", "500"], "properties": ["a, crickets, sing", "An adult is speaking, typing, and using a computer keyboard"], "captions_pred_video": ["a group of people 
dressed in camouflage and hunting gear in the dark", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man speaks and types on a keyboard"], "question": "which entity is speaking", "label": 1}, {"captions": ["a child speaks", "a man is snoring loudly and repeatedly"], "sample_ids": ["yW6FWLSLkx4", "sncRqQ67iJU"], "start_seconds": ["40", "460"], "properties": ["a, child, speaks", "loud, repeatedly, man"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["a person sniffles and sneezes", "water is sprayed across a hard surface"], "sample_ids": ["uRlbY6aoBU", "sQwlkXjQabo"], "start_seconds": ["0", "10"], "properties": ["sneezes, sniffles, person", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is sneezing ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a person is snoring while sleeping", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vJrjSeP17yE", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["a person is sleeping, snoring, person", "wind, blow, vehicle"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a door slams shut roughly", "water flows as men speak and yell"], "sample_ids": ["zkKdxzNC97Y", "vJ7JPEFhyLA"], "start_seconds": ["27", "16"], "properties": ["a door, slams, shut", "water, flow, men"], "captions_pred_video": 
["footage of the door opening and closing in slow motion", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 0}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a man speaks while rain falls onto a hard surface"], "sample_ids": ["wvKpEYswXO0", "wqN6IIHw3po"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "rain, surface, fall"], "captions_pred_video": ["of the person preparing food in the kitchen", "in your own words what is happening in this screenshot? blood splattered all over the place"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and water is splashing"], "question": "which entity has a hard surface?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xV7Mg1QucSc", "tdWhHV3X25Q"], "start_seconds": ["14", "60"], "properties": ["alarm, ticktocks, laughs", "applause, audience, yells"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a person screams glaringly", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["xC8kbrKJmco", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["glaringly, screams, person", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a goat is bleating ", "a woman is speaking and a baby is crying"], 
"question": "which entity is a person?", "label": 0}, {"captions": ["a train horn blows as it passes by", "a car speeding up in the distance"], "sample_ids": ["zVacuqSb4LI", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["horn, blows, train", "distance, car, speed"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", null], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["xjvTpk2Zpr8", "tDVADusiIoc"], "start_seconds": ["70", "60"], "properties": ["wind, blows, vehicle", "water, radio, man"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a vehicle racing past?", "label": 0}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "pigeons vocalize and birds chirp"], "sample_ids": 
["xyL9F5VrjkE", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["wind, motor, distance", "vocalize, bird, chirp"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of the pigeon in the cage"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a clock ticktocks continuously", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["vlJS7LN2XyM", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["ticktocks, clock, ticktocks continuously", "background, male, female"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a ticktock of a clock", "a lawn mower is running and men are speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["people speak softly as food sizzles", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yhQ2Lg-7qDY", "zj2R0XoFr5k"], "start_seconds": ["130", "50"], "properties": ["food, sizzle, speak", "airplane, boy, fly"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about to fly", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "someone whistles a tune"], "sample_ids": ["xM4joTqDVp4", "sIXTftIuUgw"], "start_seconds": ["160", "90"], "properties": ["background, chirp, birds", "someone, tune, whistle"], 
"captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a stream of water flows quickly", "some men converse over an engine running"], "sample_ids": ["wbHTKEJZyhc", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["stream, water, flow", "men, converse, engine"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a still image?", "label": 0}, {"captions": ["a man speaking with light rustling", "waves crash against a shoreline and people speak"], 
"sample_ids": ["zOZleIRqZm4", "yFB25fqfU8I"], "start_seconds": ["80", "300"], "properties": ["light, rustling, man", "wave, crash, shoreline"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "three men talk while wind blows and some liquid flows"], "sample_ids": ["s3cTDAj31g", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["man, talk, woman", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vzxHnu-SFEw", "y8WEcpOlT3I"], "start_seconds": ["80", "40"], "properties": ["two objects, woman, speak", "harsh, wind, blows"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors 
youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["an animal growls followed by birds chirping", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["y2ZBGpgbhHM", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["animal, growl, bird", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking and a dog is whimpering"], "question": "which entity is a television program?", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a child speaks in closed space"], "sample_ids": ["vbZ-0lGPneg", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["a woman, a television program, a bird", "child, space, speak"], "captions_pred_video": ["of a man holding a baby duck in his hands", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["an engine idles quietly then gradually becomes louder", "small dogs yip and bark sharply"], "sample_ids": ["vbr9mHKc8WM", "v-wcQf4BDY0"], "start_seconds": ["40", "120"], "properties": ["noise, loudness, engine", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see 
what is happening"], "captions_pred_audio": ["an engine is idling", "a dog barks and growls"], "question": "which entity is louder", "label": 1}, {"captions": ["an airplane engine runs", "winds blows roughly as a vehicle races past"], "sample_ids": ["yVPZ2MNWpms", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["engine, airplane, runs", "wind, blows, vehicle"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a car is driving by on the road ", "a jet engine roars and wind blows "], "question": "which entity is moving", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a man speaks as a car is passing by"], "sample_ids": ["sLUnaPT5gM8", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["loud, laughter, intermittent", "a, car, pass"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to a car passing by?", "label": 1}, {"captions": ["birds chirp and wind blows", "goats bleat and people speak"], "sample_ids": ["sxIvBMSavMQ", "z5iUE5h0EPs"], "start_seconds": ["210", "30"], "properties": ["birds, chirp, wind", "goats bleat, people speak, language"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a 
beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "of the goat in the barn"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a goat bleats and a man speaks"], "question": "which entity is a language", "label": 1}, {"captions": ["a door opens and birds chirp", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["yeFvk9x0wWI", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["door, open, birds", "loud, laughter, intermittent"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["water is sprayed across a hard surface", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sQwlkXjQabo", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "multiple, people, yell"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", null], "captions_pred_audio": ["spraying followed by silence", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["dogs barking and whimpering", "an insect buzzes around continuously"], 
"sample_ids": ["tIY7qOV3rEM", "v25l1jef3JY"], "start_seconds": ["0", "0"], "properties": ["barking, whimpering, dog", "buzzes, continuously, insect"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a fly is buzzing around a microphone "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sofxkNWaP0s", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["wind, engine, louder", "men, talk, cars"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a man speaking and wind blowing as an aircraft engine becomes louder?", "label": 0}, {"captions": ["a man speaks as a motor runs in the background", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["xZepNM9qcRA", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["background, motor, run", "background, birds, rustling"], 
"captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "birds are chirping and a chime is ringing "], "question": "which background is quieter", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["wRBHTgrbiwg", "yks4cLgIDMc"], "start_seconds": ["50", "170"], "properties": ["bird, owl, speak", "background, speaking, child"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 1}, {"captions": ["there are rhythmical snoring nearby", "plastic is tapped on while someone speaks"], "sample_ids": ["ujMt0-D-x2k", "wvKpEYswXO0"], "start_seconds": ["0", "150"], "properties": ["snoring, rhythmical, nearby", "plastic, tap, speak"], "captions_pred_video": ["of the dog playing with a toy on the floor", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["spYNpeN7rPY", "tDVADusiIoc"], "start_seconds": ["1", "60"], "properties": ["a clock, ticktock, man", "water, radio, man"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a clock?", "label": 0}, {"captions": ["a young woman speaks over spraying and another person yells", "paper folding and crinkling"], "sample_ids": ["uYT5gxnyMWM", "zPpG3RD8lSs"], "start_seconds": ["50", "20"], "properties": ["person, spray, yell", "paper, fold, crinkle"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of a skill", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "several insects fly while two men talk"], "sample_ids": ["wudZTNBtVqc", "s-T9OVOiMLo"], "start_seconds": ["60", "330"], "properties": ["accelerates, engine, wind", "several, fly, men"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vlJS7LN2XyM", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["background, clocks, ticking", "male, duck, laugh"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and ducks are quacking"], "question": "which entity is more active", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "an insect buzzes around continuously"], "sample_ids": ["sZPuqDgX2V0", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["engine, accelerate, 
intercom", "buzzes, continuously, insect"], "captions_pred_video": [null, "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["an engine runs loudly", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["vqZuVbG6-HI", "ziUT9IFTkjg"], "start_seconds": ["130", "10"], "properties": ["loud, engine, run", "background, birds, rustling"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "birds are chirping and a chime is ringing "], "question": "which is quieter", "label": 1}, {"captions": ["a dark barks and whimpers", "a man talks followed by a woman shouting"], "sample_ids": ["sYj4hpDUZDQ", "s3cTDAj31g"], "start_seconds": ["30", "80"], "properties": ["barks, whimpers, dark", "man, talk, woman"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking and a baby is crying"], "question": "which entity is talking", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wqZ135Ssz0", "wz7N8YRy74I"], "start_seconds": ["60", "30"], "properties": ["man, woman, squawks", "rooster, crow, background, men"], "captions_pred_video": [null, "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a car accelerates and wind blows", 
"paper is crumpling consistently"], "sample_ids": ["u0TrcHhkPQ", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["accelerates, wind, blows", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "distant men speak as a spray can nozzle is depressed"], "sample_ids": ["tEE3MpBt1sg", "rwtmaKiCcQU"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "nozzle, depressed, spray can"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "shows a man spraying paint on a wall with a spray gun"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "spraying and people speaking"], "question": "which entity is about a drill?", "label": 0}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a duck quacks loudly and continuously"], "sample_ids": ["uqFtmnhuqA8", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "loud, continuous, quacks"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "an airplane engine runs"], "sample_ids": ["xV7Mg1QucSc", "yVPZ2MNWpms"], "start_seconds": ["14", "0"], "properties": ["alarm, ticktocks, laughs", "engine, airplane, runs"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage of an airport with planes parked on 
the tarmac"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "some tunes played by whistling"], "sample_ids": ["sShpyu2l4YQ", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["growl, bark, yip", "tune, play, whistling"], "captions_pred_video": ["the puppies are playing with a toy", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a dog is barking and growling", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "an airplane engine spools and people speak"], "sample_ids": ["sa6TLVbooCc", "wTjoRj1se3U"], "start_seconds": ["240", "390"], "properties": ["people, laugh, child", "airplane, engine, spool"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a jet engine is running and people are talking"], "question": "which entity is about a child speaking?", "label": 0}, {"captions": ["an animal bleats and cries out and metal bangs", "water flows and trickles"], "sample_ids": ["xfudFO976zE", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["animal, bleats, cry", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and shaky", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["dogs barking and whimpering", "a infant makes noise and is excited"], "sample_ids": ["tIY7qOV3rEM", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], 
"properties": ["barking, whimpering, dog", "noise, excited, infant"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yks4cLgIDMc", "xBxDz0CFVn0"], "start_seconds": ["170", "30"], "properties": ["background, speaking, child", "stream, water, flow"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a child is crying", "a man is speaking with wind noise in the background "], "question": "which entity is a stream of water flowing?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "wind blowing followed by a zoom"], "sample_ids": ["sQGXqGcwOTc", "vr8ZXjEBhMQ"], "start_seconds": ["3", "150"], "properties": ["cling, speak, dishes", "wind, blow, zoom"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["tOSWIURC-4", "y2bVZ7rz-5M"], "start_seconds": ["0", "280"], "properties": ["noise, engine, revs", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a lawn mower is running ", "a truck is honking its horn and a siren is blaring "], "question": "which noise is made by a 
motor", "label": 1}, {"captions": ["ticking continues without interruption", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["v-g-j2uTByM", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["ticking, continuous, clock", "a, scream, girl"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a clock is ticking loudly", "a woman is speaking and a baby is crying"], "question": "which entity is not continuous", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "several insects fly while two men talk"], "sample_ids": ["uPDn2BFTHk", "s-T9OVOiMLo"], "start_seconds": ["140", "330"], "properties": ["lady, laugh, baby", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["s4Uz1Ffgo04", "ukg5L09Wpvo"], "start_seconds": ["100", "150"], "properties": ["roars, background, people speaking", "a train, a horn, a bell"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tMbMDvT50j8", "uYT5gxnyMWM"], "start_seconds": ["12", "50"], "properties": ["a, cry, woman", "female, 
spraying, scream"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["several beeps are followed by a hit and a woman talking", "a child babbles as a woman speaks"], "sample_ids": ["w34HjHr6gAY", "wEBlkGWVWwE"], "start_seconds": ["30", "260"], "properties": ["beeps, hit, woman", "a, babble, woman"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "shows a person writing on the whiteboard"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman is speaking and a child is speaking with background noise and clapping "], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["a train horn sounds and railroad crossing ring", "a child speaks in closed space"], "sample_ids": ["s7knHCFW82w", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["horn, sound, train", "child, space, speak"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is 
speaking", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "a man speaks as a motor runs in the background"], "sample_ids": ["tqR406bGiE", "xZepNM9qcRA"], "start_seconds": ["40", "30"], "properties": ["flush, water, gurgle", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a toilet is flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["birds chirp and wind blows", "birds chirp and objects are moved around"], "sample_ids": ["sxIvBMSavMQ", "yPUYU6t3rwo"], "start_seconds": ["210", "370"], "properties": ["birds, chirp, wind", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "insects buzz and a man speaks"], "question": "which entity is about birds chirping?", "label": 0}, {"captions": ["vehicle engines race around a track as a man commentates", "a man speaks followed by another man speaking outside"], "sample_ids": ["sZPuqDgX2V0", "viuTg1M-dqg"], "start_seconds": 
["30", "30"], "properties": ["commentator, race, track", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a man speaking followed by another man speaking outside?", "label": 1}, {"captions": ["an airplane engine roars increasingly louder", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["vBslzh7saPw", "su6FAOcOA8c"], "start_seconds": ["90", "4"], "properties": ["engine, roar, louder", "engine, idle, woman"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a woman is speaking and a subway train is moving "], "question": "which engine is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sfAvvZwdLCY", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "engine, accelerate, idle"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a toilet is flushed", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["a stream runs then someone speaks", "a car speeding up in the distance"], "sample_ids": ["wbHTKEJZyhc", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["stream, run, someone", "distance, car, speed"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage 
of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", null], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "wind blowing followed by a zoom"], "sample_ids": ["sOa7g-44Dag", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["audio, scratching, man", "wind, blow, zoom"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a video", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "winds blows roughly as a vehicle races past"], "sample_ids": ["sNB8zxXneIM", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["several, quack, cocks", "wind, blows, vehicle"], "captions_pred_video": ["a group of geese in a cage", "footage of a dhl plane landing on the runway"], 
"captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a child speaks in closed space"], "sample_ids": ["zsLxS-uLJTw", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["horn, blast, train", "child, space, speak"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an aircraft engine runs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["yLCORCnd35Q", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["engine, aircraft, runs", "engine, revs, vehicle"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a race car accelerates and revs its engine "], "question": "which entity has a running engine", "label": 0}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["vfYTJq7nU", "vKrYfzleLB8"], "start_seconds": ["130", "110"], "properties": ["ducks, quack, man", "a, ring, gunshots"], "captions_pred_video": [null, "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a machine beeps continuously"], "sample_ids": 
["vuUVPzd2FXw", "y682ml90jGw"], "start_seconds": ["160", "11"], "properties": ["a, steam, release", "beeps, machine, continuously"], "captions_pred_video": ["of the person cooking on the grill with a spatula", null], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "an insect buzzes around continuously"], "sample_ids": ["sAam2NqGhLY", "v25l1jef3JY"], "start_seconds": ["20", "0"], "properties": ["snoring, breathing, child", "buzzes, continuously, insect"], "captions_pred_video": ["of a little girl sleeping on a couch", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a person is snoring", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "some tunes played by whistling"], "sample_ids": ["zuua6-5goWw", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["sound, pop, bird", "tune, play, whistling"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a person whistling a song"], "question": "which entity is a musical 
instrument", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a child speaks in closed space"], "sample_ids": ["vSeGhaZt-aI", "yW6FWLSLkx4"], "start_seconds": ["50", "40"], "properties": ["water, bubbles, run", "child, space, speak"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "pigeons vocalize and birds chirp"], "sample_ids": ["xvDdE3zNf8Y", "uiS58TNyUiw"], "start_seconds": ["120", "430"], "properties": ["a, female, speaks", "vocalize, bird, chirp"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "of the pigeon in the cage"], "captions_pred_audio": ["a woman speaks and crumples paper", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["several insects fly while two men talk", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["s-T9OVOiMLo", "uZesmtKZGSw"], "start_seconds": ["330", "250"], "properties": ["several, fly, men", "men, talk, cars"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "an engine runs loudly"], "sample_ids": ["zY3icUyMdh8", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["dog, bark, engine", "loud, engine, run"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a car accelerates and wind blows", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["u0TrcHhkPQ", "su6FAOcOA8c"], "start_seconds": ["20", "4"], "properties": ["accelerates, wind, blows", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a machine runs continuously", "someone is typing on a computer keyboard"], "sample_ids": ["wdXV3Pv0jiY", "v0x1odnXtP0"], "start_seconds": ["11", "210"], "properties": ["machine, running, continuously", "keyboard, type, computer"], "captions_pred_video": ["footage is blurry and shaky", "how to make money on youtube in spanish"], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a person is typing on a keyboard"], "question": "which is not a machine", "label": 1}, {"captions": ["a person speaks over rustling leaves", "people applaud and hoot and chat quietly"], 
"sample_ids": ["zOZleIRqZm4", "wwyfGO2J4"], "start_seconds": ["80", "90"], "properties": ["rustling, leaves, person", "people, applaud, hoot"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "pigeons vocalize and birds chirp"], "sample_ids": ["ukxt9I7eMMg", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["food, pan, cook", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of the pigeon in the cage"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a person", "label": 1}, {"captions": ["a clock ticktocks continuously", "a infant makes noise and is excited"], "sample_ids": ["vlJS7LN2XyM", "wIJK3-5y0kA"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, ticktocks continuously", "noise, excited, infant"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a ticktock of a clock", "a baby cries and a woman speaks"], "question": "which entity is quieter", "label": 0}, {"captions": ["a person burps loudly for a long time nearby", "an aircraft engine runs as wind blows heavily"], "sample_ids": ["vf44CgrjT0A", "xjvTpk2Zpr8"], "start_seconds": ["20", "70"], "properties": ["loud, long, person", "engine, run, wind"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "footage of a dhl plane landing on the runway"], 
"captions_pred_audio": ["a loud burp", "a jet engine roars and wind blows "], "question": "which entity is running", "label": 1}, {"captions": ["someone whistles a tune", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sIXTftIuUgw", "vJ7JPEFhyLA"], "start_seconds": ["90", "16"], "properties": ["someone, tune, whistle", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person whistling a song", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be a song", "label": 0}, {"captions": ["a helicopter engine runs continuously", "paper is crumpling consistently"], "sample_ids": ["ugHJF0hfYkg", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["engine, running, continuously", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a helicopter is flying overhead ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["sapQIQUhFc", "uYT5gxnyMWM"], "start_seconds": ["280", "50"], "properties": ["water, trickles, flow", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a machine clanks and thumps and a male speaks", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sWZzXuWYY", "xKB8O8LTs6s"], "start_seconds": ["420", "70"], "properties": 
["male, clanks, thumps", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "multiple people speak and children yell while water gurgles"], "sample_ids": ["sa6TLVbooCc", "vb1fPSDI4c"], "start_seconds": ["240", "30"], "properties": ["people, laugh, child", "multiple, people, yell"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", null], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "people speak as gunfire rings out"], "sample_ids": ["yswmmRZFItk", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["background, frog, croak", "gunfire, ring, speak"], "captions_pred_video": ["a close up of a frog in the water", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a frog is croaking", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["zFjIWfSD-4", "sjlVMgdGSK0"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "accelerates, vehicle, race car"], "captions_pred_video": [null, "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": 
["here comes the train and it starts to blow the horn and get close", "a toilet flushes and a female speaks"], "sample_ids": ["s7knHCFW82w", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["blow horn, get close, train", "female, flushes, toilet"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "an electric engine works nearby followed by a child talking"], "sample_ids": ["sOa7g-44Dag", "xSKJGCItUWE"], "start_seconds": ["30", "10"], "properties": ["audio, scratching, man", "engine, work, child"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a high pitched engine is running and a child speaks"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["wind blows as people chatter quietly", "paper is crumpling consistently"], "sample_ids": ["xBxDz0CFVn0", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["wind, chatter, people", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry and out of focus", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "a motor vehicle roars, drowning out people speaking in the background"], "sample_ids": ["vZAw4apG0Es", "s4Uz1Ffgo04"], "start_seconds": 
["30", "100"], "properties": ["background, tick, repeat", "roars, background, people speaking"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of an ambulance arriving at the scene of an accident"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking while a boat is moving and wind is blowing "], "question": "which entity is louder", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "a car accelerates and wind blows"], "sample_ids": ["spYNpeN7rPY", "u0TrcHhkPQ"], "start_seconds": ["1", "20"], "properties": ["a clock, ticktock, man", "accelerates, wind, blows"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", null], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a jet 
engine roars, almost making a man inaudible"], "sample_ids": ["w34HjHr6gAY", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["beeps, squawk, child speaking", "loud, jet engine, roar"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "water flows as men speak and yell"], "sample_ids": ["vqZuVbG6-HI", "vJ7JPEFhyLA"], "start_seconds": ["130", "16"], "properties": ["background, male, female", "water, flow, men"], "captions_pred_video": ["footage is blurry because it's raining outside", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water flowing", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "pigeons vocalize and birds chirp"], "sample_ids": ["ukg5L09Wpvo", "uiS58TNyUiw"], "start_seconds": ["150", "430"], "properties": ["a train, a horn, a bell", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "of the pigeon in the cage"], "captions_pred_audio": ["a train blows its 
whistle and blows its horn ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a duck quacks continuously"], "sample_ids": ["vBHyYJ8pL0", "vh30P49Po6s"], "start_seconds": ["2", "30"], "properties": ["noise, door, opening", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a dark barks and whimpers", "a frog croaks as other frogs croak in the background"], "sample_ids": ["sYj4hpDUZDQ", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["barks, whimpers, dark", "background, frog, croak"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", "a close up of a frog in the water"], "captions_pred_audio": ["a dog barks and a cat meows", "a frog is croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "two women and a man talk while a kid cries"], "sample_ids": ["sWZzXuWYY", "wyllXV6PjKo"], "start_seconds": ["420", "30"], "properties": ["male, speech, banging", "a kid, talk, cry"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman speaks and a baby cries"], "question": "which entity has a kid crying?", "label": 1}, {"captions": ["food is frying and sizzles", "a woman speaks as she rubs two objects together"], "sample_ids": ["zNRChLjqcU", "vzxHnu-SFEw"], "start_seconds": ["220", "80"], "properties": ["food is frying, sizzles, food", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["water is running from a faucet into a sink", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["a man speaks as a machine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vD6lYD1l0BY", "uYT5gxnyMWM"], "start_seconds": ["330", "50"], "properties": ["a, machine, run", "female, spraying, scream"], "captions_pred_video": ["game controller being held in the hands of the person", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a woman is speaking and a baby is crying"], "question": "which entity has a female spraying and screaming?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "water flows and trickles"], "sample_ids": ["x9JovgqUcs", "tB7hWb9gTuQ"], "start_seconds": ["500", "30"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man 
speaks and types on a keyboard", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "an infant crying as a woman laughs"], "sample_ids": ["y8WEcpOlT3I", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["harsh, wind, blows", "a, laugh, infant"], "captions_pred_video": ["on how to use a sewing machine youtube", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["some men converse over an engine running", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sCiy7QS1U", "zj2R0XoFr5k"], "start_seconds": ["300", "50"], "properties": ["men, converse, engine", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["water flows and trickles", "a stream of water flows as people talk and wind blows"], "sample_ids": ["tB7hWb9gTuQ", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["water, flow, trickle", "stream, water, flow"], "captions_pred_video": ["the rocks on the beach are surrounded by water and the sky is visible in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["water is splashing and gurgling", "a man is speaking with wind noise in the background "], "question": "which entity is flowing", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "a man speaks as a motor runs in the background"], "sample_ids": ["sZPuqDgX2V0", "xZepNM9qcRA"], 
"start_seconds": ["30", "30"], "properties": ["engine, accelerate, intercom", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "water pouring and bubbling"], "sample_ids": ["weDbePuc-Xc", "uyRfq-jKPpo"], "start_seconds": ["40", "50"], "properties": ["music, slaps, human", "water, bubbles, pouring"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "a toilet flushes and water drains"], "sample_ids": ["wqADXCzngMw", "sfAvvZwdLCY"], "start_seconds": ["340", "20"], "properties": ["audio, humming, revving", "water drains, flushes, water"], 
"captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a stream of water flows as people talk and wind blows"], "sample_ids": ["ukxt9I7eMMg", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["continuous, woman, speaking", "stream, water, flow"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a stream of water flowing?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a woman speaks happily and an animal chirps"], "sample_ids": ["sG7TyPnFDR0", "uWAAAL4CIoc"], "start_seconds": ["180", "0"], "properties": ["beeps, machine, smoke alarm", "a woman, chirps, animal"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a woman is speaking and a dog is barking "], "question": "which entity is more likely to be a bird?", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["x9JovgqUcs", "xKB8O8LTs6s"], "start_seconds": ["500", "70"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "music, gunfire, explosion"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man speaks and types on a keyboard", "music plays while a woman speaks and 
gunshots are fired "], "question": "which entity is more calm", "label": 0}, {"captions": ["a door opens and birds chirp", "birds chirp quietly and an adult man speaks"], "sample_ids": ["yeFvk9x0wWI", "zuua6-5goWw"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "birds, chirp, quiet, man, speaks"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "birds are chirping and a man is speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a machine clanks and thumps and a male speaks"], "sample_ids": ["tQWGZLItBXk", "sWZzXuWYY"], "start_seconds": ["170", "420"], "properties": ["voice, music, whoosh", "male, clanks, thumps"], "captions_pred_video": ["worms revolution screenshots", null], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a sewing machine runs and a man speaks"], "question": "which entity has a male speaking?", "label": 1}, {"captions": ["long loud burping by a man", "vehicles pass by on a roadway"], "sample_ids": ["xmiUIOhtZyQ", "tgbONvsP47Y"], "start_seconds": ["60", "0"], "properties": ["loud, burp, man", "pass, vehicle, roadway"], 
"captions_pred_video": ["homer simpson drinking a beer", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person burps and music plays in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a horn rings out as a machine runs by"], "sample_ids": ["xBxDz0CFVn0", "slZLHwNbbt4"], "start_seconds": ["30", "300"], "properties": ["wind, chatter, people", "a, horn, run"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "dishes cling together then a man begins to speak"], "sample_ids": ["wRV8yMk886E", "sQGXqGcwOTc"], "start_seconds": ["0", "3"], "properties": ["liquid, spray, nozzle", "cling, speak, dishes"], "captions_pred_video": ["two cars are parked in a parking lot at night", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man speaks followed by a loud burst", "mechanisms are operating and water is splashing "], "question": "which entity is about speaking?", "label": 0}, {"captions": ["electronic beeps occur in a short series", "dishes cling together then a man begins to speak"], "sample_ids": ["y682ml90jGw", "sQGXqGcwOTc"], "start_seconds": ["11", "3"], "properties": ["beeps, series, electronic", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a beeping sound is being made ", "mechanisms are operating and water is splashing "], "question": "which entity is a video", "label": 1}, {"captions": ["wind blows and people scream while an engine revs", "a machine beeps continuously"], "sample_ids": 
["w5W5Kqtc8E", "y682ml90jGw"], "start_seconds": ["100", "11"], "properties": ["wind, engine, scream", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["people clap and speak in the distance", "a door slams shut roughly"], "sample_ids": ["wwyfGO2J4", "zkKdxzNC97Y"], "start_seconds": ["90", "27"], "properties": ["clap, distance, speak", "a door, slams, shut"], "captions_pred_video": [null, "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a door is opened and closed"], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "an airplane engine runs"], "sample_ids": ["sapQIQUhFc", "yVPZ2MNWpms"], "start_seconds": ["280", "0"], "properties": ["water, stream, trickles", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "a clock ticktocks"], "sample_ids": ["tDlysoZiA1I", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, multiple", "ticktocks, clock, ticktocks"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a person is snoring while sleeping"], "sample_ids": ["vSeGhaZt-aI", "vJrjSeP17yE"], 
"start_seconds": ["50", "40"], "properties": ["water, sink, talk", "a person is sleeping, snoring, person"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a man speaks as a car is passing by"], "sample_ids": ["sSMl2vc3ek", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["loud, multiple, distance", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "someone whistles a tune"], "sample_ids": ["v5P-ThUCINM", "sIXTftIuUgw"], "start_seconds": ["400", "90"], "properties": ["background, chirp, bird", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and birds are chirping", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "wind blowing followed by a zoom"], "sample_ids": ["vBHyYJ8pL0", "vr8ZXjEBhMQ"], "start_seconds": ["2", "150"], "properties": ["noise, door, opening", "wind, blow, zoom"], 
"captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is accompanied by a zoom", "label": 1}, {"captions": ["children speak as a female ask them questions", "a machine beeps continuously"], "sample_ids": ["wEBlkGWVWwE", "y682ml90jGw"], "start_seconds": ["260", "11"], "properties": ["female, speak, questions", "beeps, machine, continuously"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a beeping sound is being made "], "question": "which entity is not a machine?", "label": 0}, {"captions": ["roadway noise occurs and a truck accelerates", "roadway noise occurs and a truck accelerates"], "sample_ids": ["tgbONvsP47Y", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["noise, truck, accelerate", "noise, truck, accelerate"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a car is driving on the road ", "a car is driving on the road "], "question": "which truck accelerates", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a toilet flushes and a female speaks"], "sample_ids": ["wz7N8YRy74I", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["rooster, crow, background, men", "female, flushes, toilet"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "some men converse over an 
engine running"], "sample_ids": ["wRV8yMk886E", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["liquid, spray, nozzle", "men, converse, engine"], "captions_pred_video": ["two cars are parked in a parking lot at night", null], "captions_pred_audio": ["a man speaks followed by a loud burst", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity shows men speaking and a nozzle spraying liquid?", "label": 0}, {"captions": ["a jet engine screams, then increases its power", "a car speeding up in the distance"], "sample_ids": ["vBslzh7saPw", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["power, scream, increase", "distance, car, speed"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["women speak and laugh as wind blows", "a telephone rings followed by a woman talking"], "sample_ids": ["un9VQlzgZM", "tGcFnX0GHI"], "start_seconds": ["5", "0"], "properties": ["wind, speak, laugh", "ring, talk, woman"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": ["an engine starts and increases in power", "a toilet flushes and a female speaks"], "sample_ids": ["zjTG0gaGCUI", "yaln9y8I7ms"], "start_seconds": ["80", "230"], "properties": ["power, increase, engine", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a toilet flushes and a man speaks"], "question": "which entity is not a toilet?", "label": 0}, {"captions": ["a clock ticktocks continuously", "some men talk among st themselves 
as cars speed and race loudly"], "sample_ids": ["vlJS7LN2XyM", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["ticktocks, clock, ticktocks continuously", "men, talk, cars"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a person is whistling", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["sIXTftIuUgw", "vbZ-0lGPneg"], "start_seconds": ["90", "30"], "properties": ["person, whistling, person", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and a dog is whimpering"], "question": "which entity has more people", "label": 1}, {"captions": ["a small engine spits as it runs", "water splashes as an animal walks through"], "sample_ids": ["sZvwOuuPGP0", "w1ir-sZ3Im8"], "start_seconds": ["50", "90"], "properties": ["spits, engine, runs", "animal, water, splashes"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of a group of people riding horses 
through a river"], "captions_pred_audio": ["a medium engine is running ", "water splashes and gurgles as people speak"], "question": "which entity is not a person?", "label": 0}, {"captions": ["an engine runs loudly", "men speak and a nozzle sprays liquid"], "sample_ids": ["vqZuVbG6-HI", "wRV8yMk886E"], "start_seconds": ["130", "0"], "properties": ["loud, engine, run", "liquid, spray, nozzle"], "captions_pred_video": ["footage is blurry because it's raining outside", "two cars are parked in a parking lot at night"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a man speaks followed by a loud burst"], "question": "which entity is a spray?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "water pouring and bubbling"], "sample_ids": ["xV7Mg1QucSc", "uyRfq-jKPpo"], "start_seconds": ["14", "50"], "properties": ["alarm, ticktocks, laughs", "water, bubbles, pouring"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a woman talking as an infant is crying", "someone is typing on a computer keyboard"], 
"sample_ids": ["tMbMDvT50j8", "v0x1odnXtP0"], "start_seconds": ["12", "210"], "properties": ["a, talk, infant", "keyboard, type, computer"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "how to make money on youtube in spanish"], "captions_pred_audio": ["a baby cries and a woman speaks", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["wyllXV6PjKo", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["a baby, a woman, a man", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a car accelerates and wind blows"], "sample_ids": ["sapQIQUhFc", "u0TrcHhkPQ"], "start_seconds": ["280", "20"], "properties": ["water, stream, trickles", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a man speaks while water drains"], "sample_ids": ["xSKJGCItUWE", "vSeGhaZt-aI"], "start_seconds": ["10", "50"], "properties": ["engine, run, boy", "water, drain, man"], "captions_pred_video": ["footage of the helicopter flying in the room", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking and pouring liquid with background noise "], "question": "which 
entity is a man speaking while water drains?", "label": 1}, {"captions": ["an audience gives applause as a man yells and a group sings", "an infant crying as a woman laughs"], "sample_ids": ["tdWhHV3X25Q", "xhmRY9yhC7c"], "start_seconds": ["60", "20"], "properties": ["applause, audience, yells", "a, laugh, infant"], "captions_pred_video": ["a man is talking to another man on a stage in front of a microphone", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "water flows and trickles"], "sample_ids": ["w0xsN8X18Y", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["music, surface, rain", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks and is crumpling paper", "a telephone rings followed by a woman talking"], "sample_ids": ["xvDdE3zNf8Y", "tGcFnX0GHI"], "start_seconds": ["120", "0"], "properties": ["A, crumple, paper", "ring, talk, woman"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", null], "captions_pred_audio": ["a woman speaks and crumples paper", "a dial tone sounds followed by a woman speaking"], "question": "which woman is talking", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "wind blows as people chatter quietly"], "sample_ids": ["ukg5L09Wpvo", "xBxDz0CFVn0"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "wind, chatter, people"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", 
"footage is blurry and out of focus"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "an infant crying as a woman laughs"], "sample_ids": ["wIvYjuR3nrg", "xhmRY9yhC7c"], "start_seconds": ["9", "20"], "properties": ["birds, pigeons, vocalize", "a, laugh, infant"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds are chirping and cooing", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a propeller rotates loudly and intensely"], "sample_ids": ["w34HjHr6gAY", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["beeps, hit, woman", "loud, intense, propeller"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a person snoring several times", "a stream of water runs briefly"], "sample_ids": ["spJCm8tD9Zo", "x-PeY8Yb8M4"], "start_seconds": ["90", "300"], "properties": ["snore, person, several", "stream, water, run"], "captions_pred_video": 
["of a man laying on the ground with his mouth open", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person is snoring loudly", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sZPuqDgX2V0", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["commentator, race, track", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "an airplane engine runs"], "sample_ids": ["uoGVs9yUqY4", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["multiple, vocalize, wind", "engine, airplane, runs"], "captions_pred_video": ["for how to make a wooden shed door youtube", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a horn honks and then loudly blares", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["wnpJndXuxLc", "ziUT9IFTkjg"], "start_seconds": ["50", "10"], "properties": ["horn, honk, loud", "background, birds, rustling"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "birds are chirping and a chime is ringing "], "question": "which is quieter", "label": 1}, {"captions": ["sirens ring and approach 
with humming of distant traffic", "a man speaks followed by another man speaking outside"], "sample_ids": ["xERFUeZONz8", "viuTg1M-dqg"], "start_seconds": ["0", "30"], "properties": ["ring, approach, traffic", "two men, speak, follow"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["an emergency vehicle siren blares", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "an airplane accelerates briefly"], "sample_ids": ["xZepNM9qcRA", "zjTG0gaGCUI"], "start_seconds": ["30", "80"], "properties": ["background, motor, run", "accelerates, airplane, briefly"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", null], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a jet engine roars as wind blows "], "question": "which is a moving object", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zFjIWfSD-4", "wDVMhEdTiVw"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tOSWIURC-4", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["noise, engine, revs", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on 
a stage in front of a microphone"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking and a crowd is clapping"], "question": "which entity is a response to a performance", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "a man is snoring loudly and repeatedly"], "sample_ids": ["vmrxwuAMb2I", "sncRqQ67iJU"], "start_seconds": ["40", "460"], "properties": ["a dog, inhales, exhales", "loud, repeatedly, man"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "of an airplane flying in the dark sky at night"], "captions_pred_audio": ["a dog barks and growls", "a person is snoring"], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "a clock ticktocks"], "sample_ids": ["vbpKkWvfOu4", "v-g-j2uTByM"], "start_seconds": ["560", "30"], "properties": ["a, woman, man", "ticktocks, clock, ticktocks"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "people cheer as a vehicle engine revs"], "sample_ids": ["spJCm8tD9Zo", "xjhAnI2q6hM"], "start_seconds": ["90", "6"], "properties": ["snores, wheezes, sleeps", "engine revs, vehicle, people"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person is snoring loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is a person", 
"label": 0}, {"captions": ["a train horn blows as it passes by", "a woman speaks with water running"], "sample_ids": ["zVacuqSb4LI", "wTideSjRFS0"], "start_seconds": ["30", "30"], "properties": ["horn, blows, train", "water, running, woman"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of a woman cooking in a kitchen with a microwave oven"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a woman is speaking while water is running in the background"], "question": "which entity is a moving object", "label": 0}, {"captions": ["birds tweet and squawk", "birds chirp as a man speaks and a younger person speaks"], "sample_ids": ["w1mlz3Pe4fU", "xl2PIWyXaM"], "start_seconds": ["300", "160"], "properties": ["squawk, tweet, scream", "chirp, man, younger person"], "captions_pred_video": ["of a bird in a cage", null], "captions_pred_audio": ["birds are chirping and singing", "birds are chirping and people are talking"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a man speaks as a motor runs in the background"], "sample_ids": ["shmR4OZtzqA", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "background, 
motor, run"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man speaks while a motor runs", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["bees buzz and wind blows", "a stream of water flows quickly"], "sample_ids": ["tMJne1a4AFI", "wbHTKEJZyhc"], "start_seconds": ["0", "20"], "properties": ["bees buzz, wind blows, bees", "stream, water, flow"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and 
a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and"], "captions_pred_audio": ["a swarm of bees buzzing around", "a waterfall is flowing and people are speaking "], "question": "which entity is moving faster", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "some tunes played by whistling"], "sample_ids": ["vddP56-ogds", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["water, flow, laugh", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a toilet door squeaks as it is opened", "speaking following by laughing and clapping"], "sample_ids": ["sdXV-ylviw", "u2f5NpsoHBg"], "start_seconds": ["190", "30"], "properties": ["door, toilet, squeaks", "person, laugh, clap"], "captions_pred_video": [null, "is being projected on a screen at the front of the stage"], "captions_pred_audio": ["a dog barks and taps with background noise ", "a woman is speaking and a crowd is clapping"], "question": "which entity is a person", "label": 1}, {"captions": ["some tunes played by whistling", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["u6BnG6YZqJ4", "vbZ-0lGPneg"], "start_seconds": ["0", "30"], "properties": ["tune, play, whistling", "a woman, a television program, a bird"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "of a man holding a baby duck in his 
hands"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and a dog is whimpering"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks as he moves silverware in a bowl", "some men converse over an engine running"], "sample_ids": ["x6ijhqRY38s", "sCiy7QS1U"], "start_seconds": ["250", "300"], "properties": ["bowl, silverware, man", "men, converse, engine"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", null], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "people speak as gunfire rings out"], "sample_ids": ["xKB8O8LTs6s", "wqTCwqVRDlk"], "start_seconds": ["70", "80"], "properties": ["music, gunfire, explosion", "gunfire, ring, speak"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a man is speaking and a gun is fired"], "question": "which entity has more gunfire", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "a clock ticktocks"], "sample_ids": ["tK4VlLsNxak", "v-g-j2uTByM"], "start_seconds": ["120", "30"], "properties": ["a, dial, telephone", "ticktocks, clock, ticktocks"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "several beeps are followed by a hit and a woman talking"], "sample_ids": 
["vzxHnu-SFEw", "w34HjHr6gAY"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "beeps, hit, woman"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a beep sounds followed by a child speaking"], "question": "which woman is talking", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tOj4tdLRaA", "wDVMhEdTiVw"], "start_seconds": ["70", "30"], "properties": ["woman, laugh, baby", "gun, shoot, water"], "captions_pred_video": 
[null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause harm", "label": 1}, {"captions": ["wind blows and a stream of water flows nearby", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sYITalLZjj4", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["stream, flow, wind", "loud, jet engine, roar"], "captions_pred_video": ["two ducks are swimming in the water near each other", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["wind blows and birds chirp", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["tQWGZLItBXk", "vbZ-0lGPneg"], "start_seconds": ["170", "30"], "properties": ["music, person, ding", "a woman, a television program, a bird"], "captions_pred_video": ["worms revolution screenshots", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "water flows as men speak and yell"], "sample_ids": ["yPUYU6t3rwo", "vJ7JPEFhyLA"], "start_seconds": ["370", "16"], "properties": ["birds chirp, objects are moved around, birds", "water, flow, men"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["insects buzz and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more 
active", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["su6FAOcOA8c", "y8WEcpOlT3I"], "start_seconds": ["4", "40"], "properties": ["engine, run, woman", "harsh, wind, blows"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking to another man?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["zofjfKhqLk8", "w34HjHr6gAY"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "beeps, hit, woman"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "winds blows roughly as a vehicle races past"], "sample_ids": ["yajyRTUQk3U", "xjvTpk2Zpr8"], "start_seconds": ["400", "70"], "properties": ["noise, woman, speak", "wind, blows, vehicle"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of a dhl 
plane landing on the runway"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "an airplane engine runs"], "sample_ids": ["wqN6IIHw3po", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["rain, surface, fall", "engine, airplane, runs"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and water is splashing", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a male speaks over some small clicks", "children speak and play together"], "sample_ids": ["uXxVebHsGZ8", "yVVP8XvWJTo"], "start_seconds": ["30", "260"], "properties": ["male, clicks, speak", "children, speak, play"], "captions_pred_video": [null, "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "children are speaking and breathing with background noise "], "question": "which entity is more social", "label": 1}, {"captions": ["birds chirp and objects are moved around", "rain falls onto a hard surface and thunder roars before music plays"], "sample_ids": ["yPUYU6t3rwo", "xNMovAf3o50"], "start_seconds": ["370", "0"], "properties": ["birds chirp, objects are moved around, birds", "rain, thunder, music"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "tieng mua - the falling rain lynk lee"], "captions_pred_audio": ["insects buzz and a man speaks", "thunder and rain with music playing in the background "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["someone whistles a tune", "wind blows as people chatter 
quietly"], "sample_ids": ["sIXTftIuUgw", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["someone, tune, whistle", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a person whistling a song", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["small dogs yip and bark sharply", "birds chirp and a dog breathes heavily"], "sample_ids": ["v-wcQf4BDY0", "y2ZBGpgbhHM"], "start_seconds": ["120", "30"], "properties": ["bark, yip, sharply", "dog, chirp, breathe"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", null], "captions_pred_audio": ["a dog barks and growls", "birds chirping and a dog panting"], "question": "which entity is a dog", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["zsLxS-uLJTw", "siJFXfGWgDk"], "start_seconds": ["20", "50"], "properties": ["horn, blast, train", "a, bird, vehicle"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a man is speaking and birds are chirping in the background "], "question": "which entity is a vehicle passing nearby?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "a car speeding up in the distance"], "sample_ids": ["uC9dtII1KDI", "u0TrcHhkPQ"], "start_seconds": ["150", "20"], "properties": ["wind, gusts, distance", "distance, car, speed"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", null], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, 
{"captions": ["a horse runs while two women talk", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sdvI1mHAsc", "vJ7JPEFhyLA"], "start_seconds": ["20", "16"], "properties": ["two women, horse, run", "three men, wind, flow"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 0}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yYJksgsxx5U", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["audio, woman, silverware", "female, spraying, scream"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a video?", "label": 1}, {"captions": ["motors runs briefly and tires screech", "a man speaks followed by another man speaking outside"], "sample_ids": ["yRx9txMcBl0", "viuTg1M-dqg"], "start_seconds": ["40", "30"], "properties": ["motors, tires, screech", "two men, speak, follow"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage 
of water coming out of a hole in the ground"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker?", "label": 1}, {"captions": ["an engine runs and wind blows", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vs65y4qmyBE", "xfaoyyzw2WU"], "start_seconds": ["340", "180"], "properties": ["engine, run, wind", "loud, jet engine, roar"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "an airplane engine runs"], "sample_ids": ["u--KhUW8l1Y", "yVPZ2MNWpms"], "start_seconds": ["0", "0"], "properties": ["horn, siren, life", "engine, airplane, runs"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a car is driving by on the road "], "question": "which entity is a machine", "label": 1}, {"captions": ["a door slams shut roughly", "birds chirp and wind blows"], "sample_ids": ["zkKdxzNC97Y", "sxIvBMSavMQ"], "start_seconds": ["27", "210"], "properties": ["a door, slams, shut", "birds, chirp, wind"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how 
to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a"], "captions_pred_audio": ["a door is opened and closed", "birds are chirping and insects are buzzing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks as a motor runs in the background", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xZepNM9qcRA", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["background, motor, run", "clickety-clack, train, whistle"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man speaks while a motorcycle revs and accelerates ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yks4cLgIDMc", "sSMl2vc3ek"], "start_seconds": ["170", "20"], "properties": ["background, speaking, child", "loud, multiple, distance"], "captions_pred_video": ["footage of two kids wrestling on the floor", null], "captions_pred_audio": ["a man is speaking and a child is crying", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "some men converse over an engine running"], "sample_ids": ["tqR406bGiE", "sCiy7QS1U"], "start_seconds": ["40", "300"], "properties": ["flush, water, gurgle", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is 
flushed", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a toilet?", "label": 0}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a car accelerates and wind blows"], "sample_ids": ["y8dSeubCNI", "u0TrcHhkPQ"], "start_seconds": ["4", "20"], "properties": ["men, women, car", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine revving and people talking in the background", "a race car accelerates and revs its engine "], "question": "which car is moving faster", "label": 1}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a vehicle is skidding and squealing tires"], "sample_ids": ["u--KhUW8l1Y", "soTOh3zYJfY"], "start_seconds": ["0", "40"], "properties": ["engine, sound, horn", "vehicle, skid, tires"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a red car drifting on a winding road with smoke coming out of it"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a race car accelerates and revs its engine "], "question": "which vehicle is skidding", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "an infant crying frantically"], "sample_ids": ["sofxkNWaP0s", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["wind, engine, louder", "cry, infant, frantically"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "several insects fly while two men talk"], "sample_ids": ["uC9dtII1KDI", "s-T9OVOiMLo"], "start_seconds": ["150", "330"], "properties": ["wind, gusts, 
distance", "several, fly, men"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a person uses a saw to cut some wood"], "sample_ids": ["vhJWZheqaE", "sHbXC6na9hg"], "start_seconds": ["0", "0"], "properties": ["water drains unevenly, toilet flushes, water drains", "a person, saw, wood"], "captions_pred_video": [null, "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a toilet is flushed", "an engine is idling and vibrating"], "question": "which entity is a person?", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "bees buzz as wind blows"], "sample_ids": ["xV7Mg1QucSc", "tMJne1a4AFI"], "start_seconds": ["14", "0"], "properties": ["alarm, ticktocks, laughs", "bees, buzz, wind"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "a swarm of bees on the ground"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a swarm of bees buzzing around"], "question": "which entity is not a clock?", "label": 1}, {"captions": ["a man speaks uses a drill", "a person snores loudly multiple times at a close distance"], "sample_ids": ["x5eIC7S0fbg", "sSMl2vc3ek"], "start_seconds": ["60", "20"], "properties": ["A man is speaking, uses a drill, and is a tool", "loud, multiple, distance"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", null], "captions_pred_audio": ["a man is speaking and using a power tool ", "a person snoring loudly"], "question": "which entity is a tool", "label": 0}, {"captions": ["a man speaks as a vehicles passes by then a 
woman speaks", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["siJFXfGWgDk", "uYT5gxnyMWM"], "start_seconds": ["50", "50"], "properties": ["man, woman, vehicle", "a, scream, girl"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "water is sprayed across a hard surface"], "sample_ids": ["y8WEcpOlT3I", "sQwlkXjQabo"], "start_seconds": ["40", "10"], "properties": ["wind, speak, buffeting", "water, spray, surface"], "captions_pred_video": ["on how to use a sewing machine youtube", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a person is snoring while sleeping", "some tunes played by whistling"], "sample_ids": ["vJrjSeP17yE", "u6BnG6YZqJ4"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "tune, play, whistling"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person snoring loudly", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["a man woman speak while crickets sing", "multiple people speak and children yell while water gurgles"], "sample_ids": ["zTLVJCo4WEE", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "multiple, people, yell"], "captions_pred_video": ["- a boy with a rifle aiming at a target", null], "captions_pred_audio": ["a woman speaks and 
crickets chirp", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a woman speaks and other women and a man talk with her"], "sample_ids": ["vbZ-0lGPneg", "vbpKkWvfOu4"], "start_seconds": ["30", "560"], "properties": ["a woman, a television program, a bird", "a, woman, man"], "captions_pred_video": ["of a man holding a baby duck in his hands", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["distant humming of an engine", "pigeons vocalize and birds chirp"], "sample_ids": ["yVPZ2MNWpms", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["sound, distance, engine", "vocalize, bird, chirp"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "of the pigeon in the cage"], "captions_pred_audio": ["a car is driving by on the road ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["an airplane accelerates briefly", "water is sprayed across a hard surface"], "sample_ids": ["zjTG0gaGCUI", "sQwlkXjQabo"], "start_seconds": ["80", "10"], "properties": ["accelerates, airplane, briefly", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a jet engine roars as wind blows ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a vehicle accelerates 
squealing tires"], "sample_ids": ["ukxt9I7eMMg", "sd7xVssqlw"], "start_seconds": ["30", "50"], "properties": ["food, pan, cook", "accelerates, tires, squealing"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person is snoring while sleeping", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vJrjSeP17yE", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["a person is sleeping, snoring, person", "People, motor, brakes"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person?", "label": 0}, {"captions": ["a woman speaks and is crumpling paper", "paper is crumpling consistently"], "sample_ids": ["xvDdE3zNf8Y", "v5cSxLaHADY"], "start_seconds": ["120", "0"], "properties": ["A, crumple, paper", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman speaks and crumples paper", "paper is crumpled and crinkled"], "question": "which entity is crumpling paper", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["tDVADusiIoc", "uEU-Hg5MTN8"], "start_seconds": ["60", "27"], "properties": ["water, radio, man", "animal, grunts, snorts"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking 
while the wind is blowing and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking and an animal grunting and snorting?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "people speak as gunfire rings out"], "sample_ids": ["s4Uz1Ffgo04", "wqTCwqVRDlk"], "start_seconds": ["100", "80"], "properties": ["roars, background, people speaking", "gunfire, ring, speak"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "a man speaks as a motor runs in the background"], "sample_ids": ["vf44CgrjT0A", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["loud, long, person", "background, motor, run"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a loud burp", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a woman speaks happily and an animal chirps"], "sample_ids": ["sapQIQUhFc", "uWAAAL4CIoc"], "start_seconds": ["280", "0"], "properties": ["water, stream, trickles", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman is speaking and a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a woman talks while something is fried and objects are 
tapped"], "sample_ids": ["xzKKf9bKNUo", "yajyRTUQk3U"], "start_seconds": ["10", "400"], "properties": ["background, noise, snoring", "a woman, something, fried"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a child yells and another yells", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vMDHu7Lxcgw", "y8WEcpOlT3I"], "start_seconds": ["410", "40"], "properties": ["two, yell, child", "harsh, wind, blows"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking with wind noise in the background "], "question": "which entity has two people speaking?", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["w34HjHr6gAY", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["beeps, hit, woman", "a woman, a television program, a bird"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman is 
speaking and a dog is whimpering"], "question": "which entity has a woman talking?", "label": 0}, {"captions": ["food is frying then a woman speaks", "three men talk while wind blows and some liquid flows"], "sample_ids": ["ukxt9I7eMMg", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["food, woman, speak", "three men, wind, flow"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a frog vocalizes while birds chirp", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["vMf1dLD6Sng", "wqZ135Ssz0"], "start_seconds": ["6", "60"], "properties": ["frog, bird, vocalize", "two men, woman, birds"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", null], "captions_pred_audio": ["a frog croaks loudly", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity is a human activity", "label": 1}, {"captions": ["food is frying then a woman speaks", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ukxt9I7eMMg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["food, woman, speak", "female, spraying, scream"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["water flows as men speak and yell", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": 
["vJ7JPEFhyLA", "vlS6YMeWAPo"], "start_seconds": ["16", "40"], "properties": ["water, flow, men", "sheep, baa, birds"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "people speak as gunfire rings out"], "sample_ids": ["ylpYOorfH4o", "wqTCwqVRDlk"], "start_seconds": ["410", "80"], "properties": ["engine, running, wind", "gunfire, ring, speak"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a war zone", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a man speaks followed by another man speaking outside"], "sample_ids": ["sK4u5T8hW78", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "two men, speak, follow"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uYT5gxnyMWM", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["a, scream, girl", "engine, laugh, loud"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a jet engine roars "], "question": "which entity is followed by a scream", "label": 0}, {"captions": ["food is frying then a woman speaks", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["ukxt9I7eMMg", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["food, woman, speak", "animal, grunts, snorts"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking?", "label": 0}, {"captions": ["a train horn blows as it passes by", "people applaud and hoot and chat quietly"], "sample_ids": ["zVacuqSb4LI", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["horn, blows, train", "people, applaud, hoot"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", null], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sQwlkXjQabo", "wDVMhEdTiVw"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "gun, shoot, water"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["spraying followed by silence", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a diesel truck engine runs while wind blows", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["xyL9F5VrjkE", "ukg5L09Wpvo"], "start_seconds": ["20", "150"], "properties": ["engine, run, wind", "clickety-clack, train, whistle"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["the wind is blowing and a car is 
passing by ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "people speak in the background as a clock ticktocks"], "sample_ids": ["vz8868znkVQ", "vZAw4apG0Es"], "start_seconds": ["60", "30"], "properties": ["audio, click, kid speaking", "background, clock, ticktocks"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a clock is ticking and people are talking"], "question": "which entity has a clock ticking in the background?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "an insect buzzes around continuously"], "sample_ids": ["uRExseg-0XI", "v25l1jef3JY"], "start_seconds": ["210", "0"], "properties": ["woman, man, water", "buzzes, continuously, insect"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["u7C-AEBQM", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["ticks, rhythmic, quiet", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a ticktock of a clock", "a truck is honking its horn and a siren is blaring "], "question": "which entity is louder", "label": 1}, {"captions": ["an engine runs and wind blows", "dishes cling 
together then a man begins to speak"], "sample_ids": ["vs65y4qmyBE", "sQGXqGcwOTc"], "start_seconds": ["340", "3"], "properties": ["engine, run, wind", "cling, speak, dishes"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a vehicle engine runs as a siren and horn sound", "wind blows as people chatter quietly"], "sample_ids": ["u--KhUW8l1Y", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["sound, vehicle, horn", "wind, chatter, people"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a horse runs while two women talk", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sdvI1mHAsc", "zFjIWfSD-4"], "start_seconds": ["20", "410"], "properties": ["two women, horse, run", "People, motor, brakes"], "captions_pred_video": [null, null], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a stable?", "label": 0}, {"captions": ["a man speaks while water trickles and flows", "an infant crying as a woman laughs"], "sample_ids": ["sapQIQUhFc", "xhmRY9yhC7c"], "start_seconds": ["280", "20"], "properties": ["water, trickles, flow", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a baby cries and a woman speaks"], "question": 
"which entity is a person", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "the clinking of a train bell with the humming of an engine and a train horn blowing"], "sample_ids": ["wSVhSdj0F0", "zgUgkpk78xU"], "start_seconds": ["10", "70"], "properties": ["horn honks, keys jingle, slam", "clinking, humming, horn"], "captions_pred_video": [null, "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a train?", "label": 1}, {"captions": ["birds vocalize and chirp continuously", "a man speaking with light rustling"], "sample_ids": ["w1mlz3Pe4fU", "zOZleIRqZm4"], "start_seconds": ["300", "80"], "properties": ["vocalize, chirp, continuously", "light, rustling, man"], "captions_pred_video": ["of a bird in a cage", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking with crickets chirping in the background"], "question": "which entity is speaking", "label": 1}, {"captions": ["someone snores nearby", "paper folding and crinkling"], "sample_ids": ["spJCm8tD9Zo", "zPpG3RD8lSs"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "paper, fold, crinkle"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a person is snoring loudly", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vMf1dLD6Sng", "tiDFTC-5vU"], "start_seconds": ["6", "30"], "properties": ["frog, bird, vocalize", "male, duck, laugh"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", null], "captions_pred_audio": ["a frog croaks loudly", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["animals bleat and moo as a person speaks", "water flows as men speak and yell"], "sample_ids": ["tPJvjq9QePY", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["animal, bleat, moo", "water, flow, men"], "captions_pred_video": ["a dog and a sheep in a barn", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a toilet flushes and water sputters as it drains", "water splashes as an animal walks through"], "sample_ids": ["smGI3C1NZc", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["water, drain, toilet", "animal, water, 
splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a toilet is flushed", "water splashes and gurgles as people speak"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "people speak as gunfire rings out"], "sample_ids": ["wz7N8YRy74I", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["rooster, crow, background, people", "gunfire, ring, speak"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["children speak and play together", "people speak as gunfire rings out"], "sample_ids": ["yVVP8XvWJTo", "wqTCwqVRDlk"], "start_seconds": ["260", "80"], "properties": ["children, speak, play", "gunfire, ring, speak"], "captions_pred_video": ["footage of a playground at a school or daycare center", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a clock ticktocks briefly", "a woman and man speak while food is frying"], "sample_ids": ["u7C-AEBQM", "zk-xJGQU8-4"], "start_seconds": ["30", "130"], "properties": ["ticktocks, clock, ticktocks briefly", "food, man, woman"], "captions_pred_video": [null, "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a ticktock of a clock", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a man 
speaks over a running engine and blowing wind", "some tunes played by whistling"], "sample_ids": ["ylpYOorfH4o", "u6BnG6YZqJ4"], "start_seconds": ["410", "0"], "properties": ["engine, running, wind", "tune, play, whistling"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["uiS58TNyUiw", "tdWhHV3X25Q"], "start_seconds": ["430", "60"], "properties": ["vocalize, bird, chirp", "applause, audience, yells"], "captions_pred_video": ["of the pigeon in the cage", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking and a crowd is clapping"], "question": "which entity is a response to a performance", "label": 1}, {"captions": ["a woman talking as an infant is crying", "dishes cling together then a man begins to speak"], "sample_ids": ["tMbMDvT50j8", "sQGXqGcwOTc"], "start_seconds": ["12", "3"], "properties": ["a, talk, infant", "cling, speak, dishes"], "captions_pred_video": ["shows a little girl covering her face with her hands while 
sitting at a table", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a baby cries and a woman speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman talking to an infant?", "label": 0}, {"captions": ["several ducks are quacking and squawking", "an engine idles quietly then gradually becomes louder"], "sample_ids": ["wfHeoPDLMaM", "vbr9mHKc8WM"], "start_seconds": ["30", "40"], "properties": ["quacking, squawking, ducks", "noise, loudness, engine"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", null], "captions_pred_audio": ["ducks are quacking", "an engine is idling"], "question": "which entity is quieter", "label": 1}, {"captions": ["a series of light horn beeps is followed by a loud steam whistle", "people cheer as a vehicle engine revs"], "sample_ids": ["wnpJndXuxLc", "xjhAnI2q6hM"], "start_seconds": ["50", "6"], "properties": ["beeps, loud, whistle", "engine revs, vehicle, people"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a steam whistle 
blows and a train moves with wind noise in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["w5W5Kqtc8E", "uYT5gxnyMWM"], "start_seconds": ["100", "50"], "properties": ["water, splashes, motorboat", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman is speaking and a baby is crying"], "question": "which entity is about a girl speaking followed by a scream?", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["sZPuqDgX2V0", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["engine, accelerate, intercom", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "a woman speaks as she rubs two objects together"], "sample_ids": ["w8uLijTqtlU", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["wind, microphone, noise", "two objects, woman, speak"], "captions_pred_video": ["footage is blurry and shaky", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make 
a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "a car speeding up in the distance"], "sample_ids": ["y2bVZ7rz-5M", "u0TrcHhkPQ"], "start_seconds": ["280", "20"], "properties": ["engine, horn, siren", "distance, car, speed"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", null], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "wind blowing followed by a zoom"], "sample_ids": ["tDVADusiIoc", "vr8ZXjEBhMQ"], "start_seconds": ["60", "150"], "properties": ["water, radio, man", "wind, blow, zoom"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 1}, {"captions": ["engines sputter roughly and tires squeal", "wind blowing followed by a zoom"], "sample_ids": ["zhx6hoYrHeI", "vr8ZXjEBhMQ"], 
"start_seconds": ["160", "150"], "properties": ["engine, sputter, rough", "wind, blow, zoom"], "captions_pred_video": ["footage of a man working on a motorcycle's tire", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a car accelerates and revs its engine ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["an insect buzzes around continuously", "a duck quacks several times"], "sample_ids": ["v25l1jef3JY", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["buzzes, continuously, insect", "quacks, duck, several"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "wind blowing followed by a zoom"], "sample_ids": ["tqR406bGiE", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["flush, water, gurgle", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a toilet is flushed", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sxYkFKFIZD0", "tiDFTC-5vU"], "start_seconds": ["20", "30"], "properties": ["screech, man, door", "male, duck, laugh"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 
2005 audi a4 1 8t 2005 audi a4 1 8t 2", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has a door open?", "label": 0}, {"captions": ["people converse in the distance as a clock ticks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vZAw4apG0Es", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["people, clock, converse", "men, talk, cars"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an engine runs and a man speaks", "a small engine idles continuously"], "sample_ids": ["yT5WfYMRr-U", "y5WII6cTH7k"], "start_seconds": ["30", "40"], "properties": ["engine, run, man", "engine, idle, continuously"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", "footage of a sewing machine stitching a red and white hat"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "an engine is knocking and vibrating "], "question": "which engine is running", "label": 0}, {"captions": ["someone whistles a tune", "loud intermittent buzzing with 
intermittent laughter"], "sample_ids": ["sIXTftIuUgw", "sLUnaPT5gM8"], "start_seconds": ["90", "0"], "properties": ["someone, tune, whistle", "loud, laughter, intermittent"], "captions_pred_video": [null, "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a person whistling a song", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zgUgkpk78xU", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["horn, bell, train", "airplane, boy, fly"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a stream of water runs briefly"], "sample_ids": ["viuTg1M-dqg", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["two men, speak, follow", "stream, water, run"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a stream of water runs 
briefly"], "sample_ids": ["v7jJS8aAyA", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["wind, blows, loudly", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a stream of water runs briefly"], "sample_ids": ["vuUVPzd2FXw", "x-PeY8Yb8M4"], "start_seconds": ["160", "300"], "properties": ["a, steam, release", "stream, water, run"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "some men converse over an engine running"], "sample_ids": ["zgUgkpk78xU", "sCiy7QS1U"], "start_seconds": ["70", "300"], "properties": ["horn, bells, ring", "men, converse, engine"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a car accelerates and wind blows"], "sample_ids": ["x5cuQjOdM3E", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["cat, meows, young woman", "accelerates, wind, blows"], "captions_pred_video": ["a black 
background with an airplane flying in the sky", null], "captions_pred_audio": ["a cat meows and a woman speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "wind blows as people chatter quietly"], "sample_ids": ["yajyRTUQk3U", "xBxDz0CFVn0"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "wind, chatter, people"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["food is frying while a woman speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yhQ2Lg-7qDY", "w5W5Kqtc8E"], "start_seconds": ["130", "100"], "properties": ["food, woman, speak", "wind, blow, vehicle"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "water pouring and bubbling"], "sample_ids": ["s4Uz1Ffgo04", "uyRfq-jKPpo"], "start_seconds": ["100", "50"], "properties": ["roars, background, people speaking", "water, bubbles, pouring"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff 
hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "an infant crying as a woman laughs"], "sample_ids": ["sZvwOuuPGP0", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["engine, diesel, truck", "a, laugh, infant"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a medium engine is running ", "a baby cries and a woman speaks"], "question": "which is not a person", "label": 0}, {"captions": ["an engine idles quietly then gradually becomes louder", "vehicles pass by on a roadway"], "sample_ids": ["vbr9mHKc8WM", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["noise, loudness, engine", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["an engine is idling", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["someone snores nearby", "a car accelerates and wind blows"], "sample_ids": ["spJCm8tD9Zo", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["someone snores, nearby, someone", "accelerates, wind, blows"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a race car accelerates and revs its engine "], "question": 
"which entity is moving", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a clock ticktocks"], "sample_ids": ["wz7N8YRy74I", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, people", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["yVumC9TGknc", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["humming, clock, birds", "beeps, hit, woman"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a series of beeps and chirps", "a beep sounds followed by a child speaking"], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["a sleeping person snores and wheezes", "someone snores nearby"], "sample_ids": ["spJCm8tD9Zo", "spJCm8tD9Zo"], "start_seconds": ["90", "90"], "properties": ["snores, wheezes, sleeps", "someone snores, nearby, someone"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a man 
laying on the ground with his mouth open"], "captions_pred_audio": ["a person is snoring loudly", "a person is snoring loudly"], "question": "which entity is a person?", "label": 0}, {"captions": ["a woman sneezes then speaks", "paper folding and crinkling"], "sample_ids": ["x4dZyf9Gbj0", "zPpG3RD8lSs"], "start_seconds": ["130", "20"], "properties": ["sneezes, speaks, woman", "paper, fold, crinkle"], "captions_pred_video": ["footage is blurry and out of focus", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman sneezes and speaks", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling?", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "an infant crying as a woman laughs"], "sample_ids": ["ukxt9I7eMMg", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["food, pan, cook", "a, laugh, infant"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a baby cries and a woman speaks"], "question": "which entity is a 
person", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["t97k0cejSQE", "ukg5L09Wpvo"], "start_seconds": ["250", "150"], "properties": ["bird, chirp, insect", "a train, a horn, a bell"], "captions_pred_video": ["a bee on a purple thistle flower", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "birds tweet and squawk"], "sample_ids": ["uEU-Hg5MTN8", "w1mlz3Pe4fU"], "start_seconds": ["27", "300"], "properties": ["a woman, laughs, animal", "squawk, tweet, scream"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of a bird in a cage"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "birds are chirping and singing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["vs65y4qmyBE", "sQGXqGcwOTc"], "start_seconds": ["340", "3"], "properties": ["engine, run, man", "cling, speak, dishes"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "mechanisms are operating and water is splashing "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "water pouring and bubbling"], "sample_ids": ["xKB8O8LTs6s", "uyRfq-jKPpo"], "start_seconds": ["70", "50"], "properties": ["music, radio, gunshots", "water, bubbles, pouring"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "on youtube how 
to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "wind blows as people chatter quietly"], "sample_ids": ["v0x1odnXtP0", "xBxDz0CFVn0"], "start_seconds": ["210", "30"], "properties": ["keyboard, type, computer", "wind, chatter, people"], "captions_pred_video": ["how to make money on youtube in spanish", "footage is blurry and out of focus"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking with wind noise in the background "], "question": "which is quieter", "label": 0}, {"captions": ["white noise and snoring with some rustling in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["xzKKf9bKNUo", "y8WEcpOlT3I"], "start_seconds": ["10", "40"], "properties": ["background, noise, snoring", "harsh, wind, blows"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking with wind noise in the background "], "question": "which entity is a man speaking?", "label": 1}, 
{"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a car speeding up in the distance"], "sample_ids": ["sU53zg9Jp7s", "u0TrcHhkPQ"], "start_seconds": ["380", "20"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "distance, car, speed"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "wind blows as people chatter quietly"], "sample_ids": ["sQGXqGcwOTc", "xBxDz0CFVn0"], "start_seconds": ["3", "30"], "properties": ["audio, kid, giggles", "wind, chatter, people"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage is blurry and out of focus"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a machine runs continuously", "a car speeding up in the distance"], "sample_ids": ["wdXV3Pv0jiY", "u0TrcHhkPQ"], "start_seconds": ["11", "20"], "properties": ["machine, running, continuously", "distance, car, speed"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["mechanisms are operating and a bell is ringing ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["some clanking with distant murmuring", "some men converse over an engine running"], "sample_ids": ["uMTTDZ2mb4", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["clanking, murmuring, distant", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["people are 
talking and a car is driving by with wind noise in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "wind blows as people chatter quietly"], "sample_ids": ["uqFtmnhuqA8", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "wind, chatter, people"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage is blurry and out of focus"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks uses a drill", "water is sprayed across a hard surface"], "sample_ids": ["x5eIC7S0fbg", "sQwlkXjQabo"], "start_seconds": ["60", "10"], "properties": ["A man is speaking, uses a drill, and is a tool", "water, spray, surface"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking and using a power tool ", "spraying followed by silence"], "question": "which entity is a tool", "label": 0}, {"captions": ["several ducks are quacking and squawking", "several insects fly while two men talk"], "sample_ids": ["wfHeoPDLMaM", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["quacking, squawking, ducks", "several, fly, men"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the 
middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["ducks are quacking", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a group of animals?", "label": 0}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "an airplane engine spools and people speak"], "sample_ids": ["vqZuVbG6-HI", "wTjoRj1se3U"], "start_seconds": ["130", "390"], "properties": ["background, male, female", "airplane, engine, spool"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a jet engine is running and people are talking"], "question": "which entity is about an airplane?", "label": 1}, {"captions": ["a clock ticktocks", "paper is crumpling consistently"], "sample_ids": ["v-g-j2uTByM", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["ticktocks, clock, ticktocks", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a clock is ticking loudly", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a small airplane approaches and then flies by, after and 
during which a boy speaks"], "sample_ids": ["wEBlkGWVWwE", "zj2R0XoFr5k"], "start_seconds": ["260", "50"], "properties": ["a, babble, woman", "airplane, boy, fly"], "captions_pred_video": ["shows a person writing on the whiteboard", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "people applaud and hoot and chat quietly"], "sample_ids": ["wqN6IIHw3po", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["rain, surface, fall", "people, applaud, hoot"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", null], "captions_pred_audio": ["a man is speaking and water is splashing", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be in a theater", "label": 1}, {"captions": ["a female speaks softly as paper crinkles", "water pouring and bubbling"], "sample_ids": ["xvDdE3zNf8Y", "uyRfq-jKPpo"], "start_seconds": ["120", "50"], "properties": ["a, female, speaks", "water, bubbles, pouring"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube 
how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman speaks and crumples paper", "water is running from a faucet"], "question": "which entity is more likely to be a video of a person speaking?", "label": 0}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a horn honks and then loudly blares"], "sample_ids": ["wsHBIgzs9Fs", "wnpJndXuxLc"], "start_seconds": ["50", "50"], "properties": ["horn, continuous, buzzing", "horn, honk, loud"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["an electronic device bleeps once", "crowd applause while a guy laughs followed by another man speaking"], "sample_ids": ["tHJ6JSa8Y4", "tDlfY3nmx1A"], "start_seconds": ["0", "160"], "properties": ["bleeps, electronic, device", "applause, laugh, man"], "captions_pred_video": [null, "a man in a suit and tie is talking to another man in a suit and tie"], "captions_pred_audio": ["a clock is ticking and beeping", "a crowd is clapping and laughing and a man is speaking "], "question": "which entity is a man speaking?", "label": 1}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "vehicle engines race around a track as a man commentates"], "sample_ids": ["sapQIQUhFc", "sZPuqDgX2V0"], "start_seconds": ["280", "30"], "properties": ["liquid, flow, distance", "commentator, race, track"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", 
"a man is speaking and a helicopter is flying overhead "], "question": "which entity is about a race", "label": 1}, {"captions": ["a man speaks while rain falls onto a hard surface", "a man sprays as a scraping occurs in the background"], "sample_ids": ["wqN6IIHw3po", "sOa7g-44Dag"], "start_seconds": ["30", "30"], "properties": ["rain, surface, fall", "background, man, spray"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic"], "captions_pred_audio": ["a man is speaking and water is splashing", "a man is speaking and rubbing his hands together "], "question": "which entity is a man speaking?", "label": 0}, {"captions": ["a car accelerates and wind blows", "a woman speaks as she rubs two objects together"], "sample_ids": ["u0TrcHhkPQ", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["accelerates, wind, blows", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a race car accelerates and revs 
its engine ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a process", "label": 1}, {"captions": ["a person speaks over rustling leaves", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zOZleIRqZm4", "tdWhHV3X25Q"], "start_seconds": ["80", "60"], "properties": ["rustling, leaves, person", "applause, audience, yells"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["an aircraft engine runs", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yLCORCnd35Q", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], "properties": ["engine, aircraft, runs", "three men, wind, flow"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a moving object", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "paper is crumpling consistently"], "sample_ids": ["vlS6YMeWAPo", "v5cSxLaHADY"], "start_seconds": ["40", "0"], "properties": ["sheep, baa, birds", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a goat bleats and birds chirp", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["multiple birds vocalize and wind blows", "a duck 
quacks continuously"], "sample_ids": ["uoGVs9yUqY4", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["multiple, vocalize, wind", "quacks, continuously, duck"], "captions_pred_video": ["for how to make a wooden shed door youtube", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["birds are chirping and flapping their wings with wind noise in the background ", "a duck is quacking loudly"], "question": "which entity is a single animal", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a propeller rotates loudly and intensely"], "sample_ids": ["rwTERCUno", "ugHJF0hfYkg"], "start_seconds": ["90", "10"], "properties": ["engine, idle, sputter", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["an engine is idling and vibrating", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "an airplane engine runs"], "sample_ids": ["vSeGhaZt-aI", "yVPZ2MNWpms"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, run", "engine, airplane, runs"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving by on the road "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tGcFnX0GHI", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["ring, talk, woman", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, 
{"captions": ["some people speak", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vbZ-0lGPneg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "a, scream, girl"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking and a baby is crying"], "question": "which entity has a scream?", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wqADXCzngMw", "uYT5gxnyMWM"], "start_seconds": ["340", "50"], "properties": ["audio, humming, revving", "female, spraying, scream"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a woman is speaking and a baby is crying"], "question": "which entity is a video", "label": 1}, {"captions": ["a man speaking with light rustling", "water flows as men speak and yell"], "sample_ids": ["zOZleIRqZm4", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["light, rustling, man", "water, flow, men"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 0}, {"captions": ["a man speaks while water trickles and flows", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["sapQIQUhFc", "w34HjHr6gAY"], "start_seconds": ["280", "30"], "properties": ["water, trickles, flow", "beeps, 
hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["tapping occurs then a baby cries", "a person snores loudly multiple times at a close distance"], "sample_ids": ["wIJK3-5y0kA", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["a, cry, baby", "loud, multiple, distance"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a person snores loudly multiple times at a close distance"], "sample_ids": ["tDlysoZiA1I", "sSMl2vc3ek"], "start_seconds": ["0", "20"], "properties": ["animal, grunts, chirps", "loud, multiple, distance"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a stream of water flows as people talk and wind blows"], "sample_ids": ["tEE3MpBt1sg", "xBxDz0CFVn0"], "start_seconds": ["50", "30"], "properties": ["drill, something, laugh", "stream, water, 
flow"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage is blurry and out of focus"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["food is frying then a woman speaks", "waves crash against a shoreline and people speak"], "sample_ids": ["ukxt9I7eMMg", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["food, woman, speak", "wave, crash, shoreline"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a video of a woman speaking?", "label": 0}, {"captions": ["a baby laugh at a sputter", "someone is typing on a computer keyboard"], "sample_ids": ["sLUnaPT5gM8", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["laugh, sputter, baby", "keyboard, type, computer"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "how to make money on youtube in spanish"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a horse runs while two women talk", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sdvI1mHAsc", "w5W5Kqtc8E"], "start_seconds": ["20", "100"], "properties": ["two women, horse, run", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["rain falls on a surface 
as men speak and music plays", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["w0xsN8X18Y", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["music, surface, rain", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "an engine runs loudly"], "sample_ids": ["zALy31PjDl0", "vqZuVbG6-HI"], "start_seconds": ["21", "130"], "properties": ["a man, a vehicle, a horn", "loud, engine, run"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "white noise and snoring with some rustling in the background"], "sample_ids": ["vqZuVbG6-HI", "xzKKf9bKNUo"], "start_seconds": ["130", "10"], "properties": ["background, male, female", "background, noise, snoring"], "captions_pred_video": ["footage is blurry because it's raining outside", "shows a woman laying on 
a bed with her eyes closed and her mouth open"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a person snoring loudly"], "question": "which entity has a background of noise?", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "waves crash against a shoreline and people speak"], "sample_ids": ["sQGXqGcwOTc", "yFB25fqfU8I"], "start_seconds": ["3", "300"], "properties": ["audio, kid, giggles", "wave, crash, shoreline"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a person surfing in the ocean"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a video", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a man speaks as a car is passing by"], "sample_ids": ["wyllXV6PjKo", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a kid, talk, cry", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a car passing by?", "label": 1}, {"captions": ["multiple adults speaking, and a child shouting in the background", "an emergency vehicle engine runs then a horn blows and siren sounds"], "sample_ids": ["yks4cLgIDMc", "y2bVZ7rz-5M"], "start_seconds": ["170", "280"], "properties": ["background, speaking, child", "engine, 
horn, siren"], "captions_pred_video": ["footage of two kids wrestling on the floor", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking and a child is crying", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn?", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["tDVADusiIoc", "uqFtmnhuqA8"], "start_seconds": ["60", "30"], "properties": ["wind, radio, waves", "a, b, c"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["vBslzh7saPw", "sLUnaPT5gM8"], "start_seconds": ["90", "0"], "properties": ["power, scream, increase", "loud, laughter, intermittent"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "loud, continuous burping"], "sample_ids": ["vlJS7LN2XyM", "y636gklDioE"], "start_seconds": ["30", "20"], "properties": ["background, clocks, ticking", "loud, continuous, burping"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the 
blurriness of the video", "a dog sitting on a red chair in front of an old telephone"], "captions_pred_audio": ["a ticktock of a clock", "a person burps loudly several times"], "question": "which entity is louder", "label": 1}, {"captions": ["a male speaks over some small clicks", "a man speaks followed by another man speaking outside"], "sample_ids": ["uXxVebHsGZ8", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["male, clicks, speak", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a person snoring several times", "people speaking indiscriminately in the distance with a person snoring loudly nearby"], "sample_ids": ["spJCm8tD9Zo", "w2JXXIAdUdg"], "start_seconds": ["90", "10"], "properties": ["snore, person, several", "snoring, distance, person"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a close up shot of a person's mouth with a toothbrush in it"], "captions_pred_audio": ["a person is snoring loudly", "a person snoring and a dog whimpering"], "question": "which person is snoring", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a propeller rotates loudly and intensely"], "sample_ids": ["sQGXqGcwOTc", "ugHJF0hfYkg"], "start_seconds": ["3", "10"], "properties": ["cling, speak, dishes", "loud, intense, propeller"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "winds blows roughly 
as a vehicle races past"], "sample_ids": ["wz7N8YRy74I", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["rooster, crow, background, people", "wind, blows, vehicle"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a jet engine roars and wind blows "], "question": "which entity is about a vehicle racing past?", "label": 1}, {"captions": ["two frogs croak at each other", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["zg0X6BnhOLQ", "tdWhHV3X25Q"], "start_seconds": ["410", "60"], "properties": ["two frogs, croak, at each other", "applause, audience, yells"], "captions_pred_video": ["footage of lightning in the sky at night", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a frog is croaking", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "a car accelerates and wind blows"], "sample_ids": ["xSKJGCItUWE", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["engine, work, child", "accelerates, wind, blows"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["zF8yoL0rkbI", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["engine, run, someone", "a woman, laughs, animal"], "captions_pred_video": ["footage of the traffic on the street at night", "of a girl wearing a pig mask and a boy hugging her"], 
"captions_pred_audio": ["the wind is blowing hard and water is splashing", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["an infant crying as a woman laughs", "multiple people speak and children yell while water gurgles"], "sample_ids": ["xhmRY9yhC7c", "vb1fPSDI4c"], "start_seconds": ["20", "30"], "properties": ["a, laugh, infant", "multiple, people, yell"], "captions_pred_video": ["of a baby crying in a baby bouncer", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["water rushes by", "heavy rain splashes as it falls"], "sample_ids": ["x-PeY8Yb8M4", "wP8ZKrlx3oA"], "start_seconds": ["300", "40"], "properties": ["water, rushes, by", "fall, rain, splash"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a car is driving on a wet road ", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["children cry and people talk", "a man speaks as a car is passing by"], "sample_ids": ["xLwHe825Zs", "sK4u5T8hW78"], "start_seconds": ["18", "30"], "properties": ["people talk, children cry, people talk", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with background noise and breathing sounds "], 
"question": "which entity is about a car passing by?", "label": 1}, {"captions": ["vehicles pass by on a roadway", "wind blows as people chatter quietly"], "sample_ids": ["tgbONvsP47Y", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["pass, vehicle, roadway", "wind, chatter, people"], "captions_pred_video": ["footage of a fire truck entering a garage", "footage is blurry and out of focus"], "captions_pred_audio": ["a car is driving on the road ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "a vehicle engine accelerating then running on idle"], "sample_ids": ["vSeGhaZt-aI", "vYkA3cfXp5Q"], "start_seconds": ["50", "30"], "properties": ["water, bubbles, speak", "engine, accelerate, idle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "an engine is idling"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["w2M4i1mklOA", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["loud, chime, bell", "A, game, keyboard"], "captions_pred_video": ["footage of an antique clock", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend 
of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a man speaks and types on a computer keyboard "], "question": "which entity is playing a game", "label": 1}, {"captions": ["a horn honks and then loudly blares", "someone whistles a tune"], "sample_ids": ["wnpJndXuxLc", "sIXTftIuUgw"], "start_seconds": ["50", "90"], "properties": ["horn, honk, loud", "someone, tune, whistle"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "water flows and trickles"], "sample_ids": ["wSVhSdj0F0", "tB7hWb9gTuQ"], "start_seconds": ["10", "30"], "properties": ["horn honks, keys jingle, slam", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "people applaud and hoot and chat quietly"], "sample_ids": ["vlS6YMeWAPo", "wwyfGO2J4"], "start_seconds": ["40", "90"], "properties": ["sheep, baa, birds", "people, applaud, hoot"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", null], "captions_pred_audio": ["a goat bleats and birds chirp", "people are clapping and speaking with background noise "], 
"question": "which entity is more quiet", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "a woman speaks as she rubs two objects together"], "sample_ids": ["xyL9F5VrjkE", "vzxHnu-SFEw"], "start_seconds": ["20", "80"], "properties": ["wind, blows, vehicle", "two objects, woman, speak"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a physical action", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "frogs croak and vocalize"], "sample_ids": ["tK4VlLsNxak", "yswmmRZFItk"], "start_seconds": ["120", "0"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "croak, vocalize, frog"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a close up of a frog in the water"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a frog is 
croaking"], "question": "which entity is a frog", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "birds chirp and a dog breathes heavily"], "sample_ids": ["zuua6-5goWw", "y2ZBGpgbhHM"], "start_seconds": ["30", "30"], "properties": ["sound, pop, bird", "dog, chirp, breathe"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", null], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "birds chirping and a dog panting"], "question": "which entity is about a dog?", "label": 1}, {"captions": ["an infant crying frantically", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zwOBqeFTgiU", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["cry, infant, frantically", "engine, idle, woman"], "captions_pred_video": ["of the baby crying in the car seat", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a baby cries loudly", "a woman is speaking and a subway train is moving "], "question": "which entity is a human", "label": 1}, {"captions": ["an engine starts and increases in power", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["zjTG0gaGCUI", "yDoT73BWsdA"], "start_seconds": ["80", "10"], "properties": ["power, increase, engine", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], 
"captions_pred_audio": ["a jet engine roars as wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["food is frying then a woman speaks", "water flows and trickles"], "sample_ids": ["ukxt9I7eMMg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["food, woman, speak", "water, flow, trickle"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a man speaks as a car is passing by"], "sample_ids": ["sZvwOuuPGP0", "sK4u5T8hW78"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "a, car, pass"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a vehicle", "label": 1}, {"captions": ["people speak then an engine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uMTTDZ2mb4", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["engine, run, people", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint 
on the ceiling"], "captions_pred_audio": ["people are talking and a car is driving by with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a person speaking and then an engine runs?", "label": 0}, {"captions": ["a railroad crossing bell rings as a train horn blows", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["tZGN5a7ybxo", "yDoT73BWsdA"], "start_seconds": ["60", "10"], "properties": ["ring, train, horn", "engine, revs, vehicle"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a train is moving and blowing its horn ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a person whistles a meandering tune", "people cheer as a vehicle engine revs"], "sample_ids": ["uFoga8sHpiw", "xjhAnI2q6hM"], "start_seconds": ["90", "6"], "properties": ["person, tune, whistle", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a bird in a cage", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person whistles a song", "a truck is revving its engine and a man is speaking "], "question": "which entity is more likely to be in a vehicle", "label": 1}, {"captions": ["a vehicle engine runs and someone speaks", "some men converse over an engine running"], "sample_ids": ["zF8yoL0rkbI", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["engine, run, someone", "men, converse, engine"], "captions_pred_video": ["footage of the traffic on the street at night", null], "captions_pred_audio": ["the wind is blowing hard and water is splashing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "several beeps are 
followed by a hit and a woman talking"], "sample_ids": ["v5P-ThUCINM", "w34HjHr6gAY"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "beeps, hit, woman"], "captions_pred_video": [null, "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "some men converse over an engine running"], "sample_ids": ["tDVADusiIoc", "sCiy7QS1U"], "start_seconds": ["60", "300"], "properties": ["water, radio, man", "men, converse, engine"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking over a radio as wind blows and water splashes?", "label": 0}, {"captions": ["someone is typing on a computer keyboard", "a woman speaks as she rubs two objects together"], "sample_ids": ["v0x1odnXtP0", "vzxHnu-SFEw"], "start_seconds": ["210", "80"], "properties": ["keyboard, type, computer", "two objects, woman, speak"], "captions_pred_video": ["how to make money on youtube in spanish", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup 
with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking and breathing with mechanisms in the background "], "question": "which object is being rubbed together", "label": 0}, {"captions": ["a woman talks while something is fried and objects are tapped", "water is sprayed across a hard surface"], "sample_ids": ["yajyRTUQk3U", "sQwlkXjQabo"], "start_seconds": ["400", "10"], "properties": ["a woman, something, fried", "water, spray, surface"], "captions_pred_video": ["- a woman cooking in the kitchen", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a man speaks as a car is passing by"], "sample_ids": ["xV7Mg1QucSc", "sK4u5T8hW78"], "start_seconds": ["14", "30"], "properties": ["alarm, ticktocks, laughs", "a, car, pass"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["an airplane engine runs", "wind blowing followed by a zoom"], "sample_ids": ["yVPZ2MNWpms", "vr8ZXjEBhMQ"], "start_seconds": ["0", "150"], "properties": ["engine, airplane, runs", "wind, blow, zoom"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a car is driving by on the road ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is not a zoom?", "label": 0}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a woman speaks happily and an animal chirps"], "sample_ids": ["wqZ135Ssz0", "uWAAAL4CIoc"], "start_seconds": ["60", "0"], "properties": ["man, woman, squawks", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a woman is speaking and a dog is barking "], "question": "which entity has a bird squawks accompanied by a man and woman speaking?", "label": 0}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "paper is crumpling consistently"], "sample_ids": ["tDlfY3nmx1A", "v5cSxLaHADY"], "start_seconds": ["160", "0"], "properties": ["applause, laugh, man", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking 
", "paper is crumpled and crinkled"], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a man speaks over intermittent keyboard taps"], "sample_ids": ["weDbePuc-Xc", "tw76HGONaKg"], "start_seconds": ["40", "570"], "properties": ["music, slaps, human", "audio, man, keyboard"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a woman speaks as she rubs two objects together"], "sample_ids": ["yPUYU6t3rwo", "vzxHnu-SFEw"], "start_seconds": ["370", "80"], "properties": ["birds chirp, objects are moved around, birds", "two objects, woman, speak"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "how to make a paper cup with scissors youtube how to make 
a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["insects buzz and a man speaks", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["humming of idling and revving engine with a man speaking", "a man speaks over intermittent keyboard taps"], "sample_ids": ["wqADXCzngMw", "tw76HGONaKg"], "start_seconds": ["340", "570"], "properties": ["audio, humming, revving", "audio, man, keyboard"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the 
legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over it?", "label": 0}, {"captions": ["birds tweet and squawk", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["w1mlz3Pe4fU", "tdWhHV3X25Q"], "start_seconds": ["300", "60"], "properties": ["squawk, tweet, scream", "applause, audience, yells"], "captions_pred_video": ["of a bird in a cage", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking and a crowd is clapping"], "question": "which entity is a human activity", "label": 1}, {"captions": ["birds chirp and wind blows", "a man speaks uses a drill"], "sample_ids": ["sxIvBMSavMQ", "x5eIC7S0fbg"], "start_seconds": ["210", "60"], "properties": ["birds, chirp, wind", "A man is speaking, uses a drill, and is a tool"], "captions_pred_video": ["beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a", "a person in surgical gloves is using a needle to remove a small object from a tooth"], 
"captions_pred_audio": ["birds are chirping and insects are buzzing", "a man is speaking and using a power tool "], "question": "which entity is a tool", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a power tool runs and touches a surface"], "sample_ids": ["xfudFO976zE", "zfvPRf3chY"], "start_seconds": ["0", "290"], "properties": ["animal, bleats, cry", "power tool, run, touch"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a man is speaking while a power tool is being used "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a airplane flies overhead as a woman speaks"], "sample_ids": ["tQWGZLItBXk", "zj2R0XoFr5k"], "start_seconds": ["170", "50"], "properties": ["voice, music, whoosh", "airplane, fly, woman"], "captions_pred_video": ["worms revolution screenshots", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a person is snoring while sleeping", "water flows as men speak and yell"], "sample_ids": ["vJrjSeP17yE", "vJ7JPEFhyLA"], "start_seconds": ["40", "16"], "properties": ["a person is sleeping, snoring, person", "water, flow, men"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a person speaking and yelling?", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "waves 
crash against a shoreline and people speak"], "sample_ids": ["yYJksgsxx5U", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["audio, woman, silverware", "wave, crash, shoreline"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a video", "label": 1}, {"captions": ["a small musical boom and then birds tweet and a few dogs pant", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y2ZBGpgbhHM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["birds, tweet, pant", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds chirping and a dog panting", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "vehicles pass by on a roadway"], "sample_ids": ["uPDn2BFTHk", "tgbONvsP47Y"], "start_seconds": ["140", "0"], "properties": ["woman, laughs, speaks", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a telephone rings followed by a woman talking"], "sample_ids": ["uYT5gxnyMWM", "tGcFnX0GHI"], "start_seconds": ["50", "0"], "properties": ["female, spraying, scream", "ring, talk, woman"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a conversation", "label": 1}, {"captions": 
["continuous snoring", "someone whistles a tune"], "sample_ids": ["sLkeqCDJIyw", "sIXTftIuUgw"], "start_seconds": ["120", "90"], "properties": ["loud, snoring, noise", "someone, tune, whistle"], "captions_pred_video": [", what is the man doing on the couch? sleeping", null], "captions_pred_audio": ["a person is snoring loudly", "a person whistling a song"], "question": "which noise is quieter", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "someone snores nearby"], "sample_ids": ["w2JXXIAdUdg", "spJCm8tD9Zo"], "start_seconds": ["10", "90"], "properties": ["snoring, distance, person", "someone snores, nearby, someone"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a person is snoring loudly"], "question": "which entity is a person?", "label": 0}, {"captions": ["leaves rustle while man speaks", "small dogs yip and bark sharply"], "sample_ids": ["zOZleIRqZm4", "v-wcQf4BDY0"], "start_seconds": ["80", "120"], "properties": ["leaves, rustle, speak", "bark, yip, sharply"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a dog barks and growls"], "question": "which entity is more quiet", "label": 0}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a man speaks and is typing on a keyboard"], "sample_ids": ["xKB8O8LTs6s", "x9JovgqUcs"], "start_seconds": ["70", "500"], "properties": ["music, radio, gunshots", "a, man, speaks, keyboard"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", null], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a 
man speaks and types on a keyboard"], "question": "which entity is typing on a keyboard?", "label": 1}, {"captions": ["a man is filing a hard object", "a propeller rotates loudly and intensely"], "sample_ids": ["vveS8HT7Uog", "ugHJF0hfYkg"], "start_seconds": ["100", "10"], "properties": ["a man, hard, object", "loud, intense, propeller"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a helicopter is flying overhead "], "question": "which object is rotating loudly", "label": 1}, {"captions": ["a person snores loudly multiple times at a close distance", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sSMl2vc3ek", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["loud, multiple, distance", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vzceMbklWc", "yDoT73BWsdA"], "start_seconds": ["180", "10"], "properties": ["water, faucet, sink", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["water is running and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a toilet flushes and a female speaks"], "sample_ids": ["t25U-v4k4ts", "yaln9y8I7ms"], "start_seconds": ["40", "230"], "properties": ["bees buzz, birds chirp, man speaks", 
"female, flushes, toilet"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a toilet flushes and a man speaks"], "question": "which entity is a man speaking?", "label": 0}, {"captions": ["vehicle engines race around a track as a man commentates", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sZPuqDgX2V0", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["commentator, race, track", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is speaking with wind noise in the background "], "question": "which entity is a video of a race?", "label": 0}, {"captions": ["a heavy rain falls endlessly", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wP8ZKrlx3oA", "yajyRTUQk3U"], "start_seconds": ["40", "400"], "properties": ["heavy, rain, fall", "a woman, something, fried"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman is speaking while food is frying in the background"], "question": "which entity is a video of something being fried?", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["un9VQlzgZM", "y8WEcpOlT3I"], "start_seconds": ["5", "40"], "properties": ["females, talk, laugh", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is 
about a harsh wind blowing?", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a child speaks in closed space"], "sample_ids": ["rwTERCUno", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["engine, idle, sputter", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["an engine is idling and vibrating", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["tDlfY3nmx1A", "vlS6YMeWAPo"], "start_seconds": ["160", "40"], "properties": ["applause, laugh, man", "sheep, baa, birds"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a telephone rings followed by a woman talking"], "sample_ids": ["uRExseg-0XI", "tGcFnX0GHI"], "start_seconds": ["210", "0"], "properties": ["woman, man, water", "ring, talk, woman"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", null], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "several insects fly while two men talk"], "sample_ids": ["wRV8yMk886E", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": 
["liquid, spray, nozzle", "several, fly, men"], "captions_pred_video": ["two cars are parked in a parking lot at night", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more insects", "label": 1}, {"captions": ["a door opens and closes", "a man speaks while playing a video game on a keyboard"], "sample_ids": ["vBHyYJ8pL0", "tw76HGONaKg"], "start_seconds": ["2", "570"], "properties": ["open, close, door", "A, game, keyboard"], "captions_pred_video": [null, "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a man speaks and types on a computer keyboard "], "question": "which entity is a person", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["sQGXqGcwOTc", "wqZ135Ssz0"], "start_seconds": ["3", "60"], "properties": ["cling, speak, dishes", "two men, woman, birds"], "captions_pred_video": ["a man washing dishes in a commercial 
kitchen", null], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["people applaud and hoot and chat quietly", "wind blowing followed by a zoom"], "sample_ids": ["wwyfGO2J4", "vr8ZXjEBhMQ"], "start_seconds": ["90", "150"], "properties": ["people, applaud, hoot", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["wind blows strongly and a young man speaks", "a duck quacks continuously"], "sample_ids": ["vs65y4qmyBE", "vh30P49Po6s"], "start_seconds": ["340", "30"], "properties": ["wind, blows, strongly", "quacks, continuously, duck"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a car accelerates and wind blows"], "sample_ids": ["vlJS7LN2XyM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["background, clocks, ticking", "accelerates, wind, blows"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a race car accelerates and revs its engine "], "question": "which entity is more active", "label": 1}, {"captions": ["a motorcycle engine works nearby", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tOSWIURC-4", 
"w5W5Kqtc8E"], "start_seconds": ["0", "100"], "properties": ["engine, work, nearby", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a lawn mower is running ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine?", "label": 1}, {"captions": ["a clang followed by a toilet flushing", "a car accelerates and wind blows"], "sample_ids": ["wNZ5thZM7XU", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["sound, flush, toilet", "accelerates, wind, blows"], "captions_pred_video": ["footage of a toilet in a bathroom stall", null], "captions_pred_audio": ["a toilet flushes", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a person speaks over rustling leaves", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zOZleIRqZm4", "vfYTJq7nU"], "start_seconds": ["80", "130"], "properties": ["rustling, leaves, person", "rustling, ducks, quack"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a duck quacks and a woman speaks"], "question": "which entity has a person speaking over rustling leaves?", "label": 0}, {"captions": ["wind blows strongly", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["w8uLijTqtlU", "vbZ-0lGPneg"], "start_seconds": ["70", "30"], "properties": ["wind, blows, strongly", "a woman, a television program, a bird"], "captions_pred_video": ["footage is blurry and shaky", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["the wind is blowing strongly", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["a fly buzzes around loudly as birds 
chirp", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["uJV8NDaHqqk", "y8WEcpOlT3I"], "start_seconds": ["100", "40"], "properties": ["loud, fly, chirp", "harsh, wind, blows"], "captions_pred_video": ["a bee hive in a wooden box", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["continuous sneezing together with speech", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["x4dZyf9Gbj0", "uYT5gxnyMWM"], "start_seconds": ["130", "50"], "properties": ["continuous, sneeze, speech", "female, spraying, scream"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman sneezes and speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["an airplane engine runs", "a toilet flushes and water sputters as it drains"], "sample_ids": ["yVPZ2MNWpms", "smGI3C1NZc"], "start_seconds": ["0", "30"], "properties": ["engine, airplane, runs", "water, drain, toilet"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a woman speaks happily and an animal chirps"], "sample_ids": ["xzKKf9bKNUo", "uWAAAL4CIoc"], "start_seconds": ["10", "0"], "properties": ["background, noise, snoring", "a woman, chirps, animal"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", null], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a dog is barking "], "question": "which entity is 
more active", "label": 1}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tDVADusiIoc", "vYkA3cfXp5Q"], "start_seconds": ["60", "30"], "properties": ["water, radio, man", "engine, accelerate, idle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a machine beeps continuously"], "sample_ids": ["y8dSeubCNI", "y682ml90jGw"], "start_seconds": ["4", "11"], "properties": ["men, women, car", "beeps, machine, continuously"], "captions_pred_video": [null, null], "captions_pred_audio": ["an engine revving and people talking in the background", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a goat bleats and someone makes a calling noise", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vlS6YMeWAPo", "xKB8O8LTs6s"], "start_seconds": ["40", "70"], "properties": ["noise, bleat, call", "music, gunfire, explosion"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a goat bleats and birds chirp", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["zj2R0XoFr5k", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["airplane, fly, woman", "water, radio, man"], "captions_pred_video": ["footage of a small 
airplane flying in the sky stock videos and royalty-free footage", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a train horn sounds as the train approaches", "a horn rings out as a machine runs by"], "sample_ids": ["slZLHwNbbt4", "slZLHwNbbt4"], "start_seconds": ["300", "300"], "properties": ["train, horn, sound", "a, horn, run"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a clock alarm sounds and gears turn", "a duck quacks continuously"], "sample_ids": ["w2M4i1mklOA", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["alarm, gears, turn", "quacks, continuously, duck"], "captions_pred_video": ["footage of an antique clock", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["the revving of an engine throttle followed by a man speaking", "wind blowing followed by a zoom"], "sample_ids": ["tezvROoo4bs", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], "properties": ["audio, throttle, speaking", "wind, blow, zoom"], "captions_pred_video": ["footage of a busy city street with cars parked on both sides of the road", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a car accelerates and revs while a man speaks ", "wind blows and a chainsaw cuts through wood "], "question": "which 
entity is a video", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vzxHnu-SFEw", "vb1fPSDI4c"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "multiple, people, yell"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a crowd of people are talking and laughing"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "birds chirp and objects are moved around"], "sample_ids": ["uiS58TNyUiw", "yPUYU6t3rwo"], "start_seconds": ["430", "370"], "properties": ["vocalize, bird, chirp", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of the pigeon in the cage", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "insects buzz and a man speaks"], "question": "which entity is a video of birds chirping?", "label": 1}, 
{"captions": ["plastic is tapped on while someone speaks", "water drips and bubbles as a man speaks"], "sample_ids": ["wvKpEYswXO0", "vSeGhaZt-aI"], "start_seconds": ["150", "50"], "properties": ["plastic, tap, speak", "water, bubbles, speak"], "captions_pred_video": ["of the person preparing food in the kitchen", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is a liquid", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "an engine runs loudly"], "sample_ids": ["wDVMhEdTiVw", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["gun, shoot, water", "loud, engine, run"], "captions_pred_video": ["a blurry image of trees and water in the forest", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "an airplane engine spools and people speak"], "sample_ids": ["wyllXV6PjKo", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["a kid, talk, cry", "airplane, engine, spool"], "captions_pred_video": [null, "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a woman speaks and a baby cries", "a jet engine is running and people are talking"], "question": "which entity is about a kid?", "label": 0}, {"captions": ["someone snores nearby", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["spJCm8tD9Zo", "tdWhHV3X25Q"], "start_seconds": ["90", "60"], "properties": ["someone snores, nearby, someone", "applause, audience, yells"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "a man is 
talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "vehicles pass by on a roadway"], "sample_ids": ["yswmmRZFItk", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["background, frog, croak", "pass, vehicle, roadway"], "captions_pred_video": ["a close up of a frog in the water", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a frog is croaking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a stream runs then someone speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wbHTKEJZyhc", "wz7N8YRy74I"], "start_seconds": ["20", "30"], "properties": ["stream, run, someone", "rooster, crow, background, men"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background 
video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vf9xf3vMsGM", "xfaoyyzw2WU"], "start_seconds": ["540", "180"], "properties": ["A man speaks while turning a water faucet on.", "loud, jet engine, roar"], "captions_pred_video": ["of the person washing their hands under the faucet", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking while water is running in the background", "an aircraft engine roars and a man speaks "], "question": "which entity is quieter", "label": 0}, {"captions": ["a woman speaks and is crumpling paper", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["xvDdE3zNf8Y", "zj2R0XoFr5k"], "start_seconds": ["120", "50"], "properties": ["A, crumple, paper", "airplane, boy, fly"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman speaks and crumples paper", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a car accelerates and wind blows"], "sample_ids": ["sDSppXIlJrs", "u0TrcHhkPQ"], "start_seconds": ["27", "20"], "properties": ["microphone, water, wind", "accelerates, wind, blows"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", null], "captions_pred_audio": ["the wind is 
blowing and water is splashing", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zsLxS-uLJTw", "w5W5Kqtc8E"], "start_seconds": ["20", "100"], "properties": ["horn, blast, train", "wind, blow, vehicle"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", null], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a vehicles accelerate quickly and someone laughs", "a person whistles a meandering tune"], "sample_ids": ["uWPRNLnpy7Y", "uFoga8sHpiw"], "start_seconds": ["10", "90"], "properties": ["accelerate, laugh, vehicle", "person, tune, whistle"], "captions_pred_video": ["is taken from a car driving down the street", "footage of a bird in a cage"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a person whistles a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a woman speaks and a baby laughs", "paper folding and crinkling"], "sample_ids": ["tOj4tdLRaA", "zPpG3RD8lSs"], "start_seconds": ["70", "20"], "properties": ["woman, laugh, baby", "paper, fold, crinkle"], "captions_pred_video": [null, "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a 
valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a baby laughs and a woman speaks", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["an adult woman and an adult man speak", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["zTLVJCo4WEE", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["two people, adult, speak", "animal, grunts, snorts"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a woman is speaking and a baby is crying"], "question": "which entity has a more snorts", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "waves crash against a shoreline and people speak"], "sample_ids": ["uRExseg-0XI", "yFB25fqfU8I"], "start_seconds": ["210", "300"], "properties": ["woman, man, water", "wave, crash, shoreline"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more active", "label": 1}, {"captions": ["male speech with light ticking", "water is sprayed across a hard surface"], "sample_ids": ["xO-Q2BlIIPU", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["male, speech, ticking", "water, spray, surface"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 
2016", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "water is sprayed across a hard surface"], "sample_ids": ["rwTERCUno", "sQwlkXjQabo"], "start_seconds": ["90", "10"], "properties": ["engine, idle, sputter", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["an engine is idling and vibrating", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a clock ticktocks"], "sample_ids": ["w2M4i1mklOA", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["loud, chime, bell", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of an antique clock", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a clock is ticking loudly"], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks and other women and a man talk with her", "long loud burping by a man"], "sample_ids": ["vbpKkWvfOu4", "xmiUIOhtZyQ"], "start_seconds": ["560", "60"], "properties": ["a, woman, man", "loud, burp, man"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "homer simpson drinking a beer"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a person burps and music plays in the background "], "question": "which entity is a burp", "label": 1}, {"captions": ["women speak and laugh as wind blows", "someone 
whistles a tune"], "sample_ids": ["un9VQlzgZM", "sIXTftIuUgw"], "start_seconds": ["5", "90"], "properties": ["wind, speak, laugh", "someone, tune, whistle"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a duck quacks continuously"], "sample_ids": ["yPUYU6t3rwo", "vh30P49Po6s"], "start_seconds": ["370", "30"], "properties": ["birds chirp, objects are moved around, birds", "quacks, continuously, duck"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["insects buzz and a man speaks", "a duck is quacking loudly"], "question": "which entity is a bird", "label": 0}, {"captions": ["a woman and man are speaking", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["vbpKkWvfOu4", "uYT5gxnyMWM"], "start_seconds": ["560", "50"], "properties": ["two people, speaking, woman, man", "a, scream, girl"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a wooden clack accompanies nearby chirping birds"], "sample_ids": ["sWZzXuWYY", "yeFvk9x0wWI"], "start_seconds": ["420", "30"], "properties": ["male, clanks, thumps", "clack, bird, chirp"], "captions_pred_video": 
[null, "a mouse in a cage on the sidewalk in front of a fence"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "birds chirp in the background as a car drives by "], "question": "which entity is accompanied by birds", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a vehicle engine accelerating then running on idle"], "sample_ids": ["zgUgkpk78xU", "vYkA3cfXp5Q"], "start_seconds": ["70", "30"], "properties": ["clinking, humming, horn", "engine, accelerate, idle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a vehicle accelerates and squeals tires"], "sample_ids": ["xjvTpk2Zpr8", "yRx9txMcBl0"], "start_seconds": ["70", "40"], "properties": ["engine, run, wind", "accelerates, tires, squeals"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods 
cars, gta"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a car is revving its engine and skidding "], "question": "which entity is moving", "label": 1}, {"captions": ["an airplane accelerates briefly", "someone sprays a liquid onto a hard surface making a hiss sound"], "sample_ids": ["zjTG0gaGCUI", "zO-LSSY92ZM"], "start_seconds": ["80", "30"], "properties": ["accelerates, airplane, briefly", "liquid, surface, sound"], "captions_pred_video": [null, "youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car's air filter youtube how to clean your car'"], "captions_pred_audio": ["a jet engine roars as wind blows ", "steam is hissing and hissing"], "question": "which entity is not a liquid?", "label": 0}, {"captions": ["speaking following by laughing and clapping", "several insects fly while two men talk"], "sample_ids": ["u2f5NpsoHBg", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["person, laugh, clap", "several, fly, men"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a man is 
speaking while insects are buzzing in the background "], "question": "which entity is a video of a person speaking and laughing and clapping?", "label": 0}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a child speaks in closed space"], "sample_ids": ["uEU-Hg5MTN8", "yW6FWLSLkx4"], "start_seconds": ["27", "40"], "properties": ["a woman, laughs, animal", "child, space, speak"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a baby cries and a woman speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tMbMDvT50j8", "zj2R0XoFr5k"], "start_seconds": ["12", "50"], "properties": ["a, cry, woman", "airplane, boy, fly"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a baby crying and a woman speaking?", "label": 0}, {"captions": ["dishes cling together then a man begins to speak", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sQGXqGcwOTc", "xKB8O8LTs6s"], "start_seconds": ["3", "70"], "properties": ["cling, speak, dishes", "music, gunfire, explosion"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "music plays while a woman speaks and gunshots are fired "], "question": 
"which entity has more gunfire", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "a stream of water runs briefly"], "sample_ids": ["wTjoRj1se3U", "x-PeY8Yb8M4"], "start_seconds": ["390", "300"], "properties": ["engine, run, people", "stream, water, run"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vYkA3cfXp5Q", "xfaoyyzw2WU"], "start_seconds": ["30", "180"], "properties": ["engine, accelerate, idle", "loud, jet engine, roar"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["an engine is idling", "an aircraft engine roars and a man speaks "], "question": "which engine is louder", "label": 1}, {"captions": ["ducks quack as a man speaks and makes a duck sound", "water pouring and bubbling"], "sample_ids": ["vfYTJq7nU", "uyRfq-jKPpo"], "start_seconds": ["130", "50"], "properties": ["ducks, quack, man", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a duck quacks and a woman speaks", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "people applaud and hoot and chat quietly"], "sample_ids": ["vlJS7LN2XyM", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["background, clocks, ticking", "people, applaud, hoot"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "people speak as gunfire rings out"], "sample_ids": ["yRx9txMcBl0", "wqTCwqVRDlk"], "start_seconds": ["40", "80"], "properties": ["accelerates, tires, squeals", "gunfire, ring, speak"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, 
{"captions": ["wind blows as people chatter quietly", "water flows and trickles"], "sample_ids": ["xBxDz0CFVn0", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["wind, chatter, people", "water, flow, trickle"], "captions_pred_video": ["footage is blurry and out of focus", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a vehicle engine accelerating then running on idle", "a clock ticktocks and sounds an alarm then a man laughs"], "sample_ids": ["vYkA3cfXp5Q", "xV7Mg1QucSc"], "start_seconds": ["30", "14"], "properties": ["engine, accelerate, idle", "alarm, ticktocks, laughs"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "a cuckoo clock hanging on the wall"], "captions_pred_audio": ["an engine is idling", "an alarm clock ticks and a woman laughs"], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["an engine runs and wind blows", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["vs65y4qmyBE", "xKB8O8LTs6s"], "start_seconds": ["340", "70"], "properties": ["engine, run, wind", "music, gunfire, explosion"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "an airplane engine spools and people speak"], "sample_ids": ["v7jJS8aAyA", "wTjoRj1se3U"], "start_seconds": ["10", "390"], "properties": ["wind, blows, loudly", "airplane, engine, spool"], "captions_pred_video": [null, "footage 
of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a jet engine is running and people are talking"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["water gurgles, metal squeaks and the water stops", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["x4a9YGIw4ok", "yajyRTUQk3U"], "start_seconds": ["120", "400"], "properties": ["water, gurgles, stops", "a woman, something, fried"], "captions_pred_video": ["footage is blurry and out of focus", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a toilet flushes and water splashes", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a man speaks over intermittent keyboard taps", "a duck quacks loudly and continuously"], "sample_ids": ["tw76HGONaKg", "vh30P49Po6s"], "start_seconds": ["570", "30"], "properties": ["audio, man, keyboard", "loud, continuous, quacks"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man speaks 
and types on a computer keyboard ", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tDlysoZiA1I", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["animal, grunts, chirps", "a woman, something, fried"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "some tunes played by whistling"], "sample_ids": ["wAAkbZToh8", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["burp, laugh, speak", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man burps and a woman speaks", "a person whistling a song"], "question": "which entity is playing a tune", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "people applaud and hoot and chat quietly"], "sample_ids": ["wvKpEYswXO0", "wwyfGO2J4"], "start_seconds": ["150", "90"], "properties": ["plastic, tap, speak", "people, applaud, hoot"], "captions_pred_video": ["of the person preparing food in the kitchen", null], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "people are clapping and speaking with background noise "], "question": "which entity is a group of people", "label": 1}, {"captions": ["a person snoring", "water flows and trickles"], "sample_ids": ["t8tv5YRMJUg", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["a person, snore, loud", "water, flow, trickle"], "captions_pred_video": ["of a man getting his face licked by 
another man", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a person sniffs and breathes heavily", "water is splashing and gurgling"], "question": "which entity is not loud", "label": 1}, {"captions": ["water flows followed by women screaming", "winds blows roughly as a vehicle races past"], "sample_ids": ["w5W5Kqtc8E", "xjvTpk2Zpr8"], "start_seconds": ["100", "70"], "properties": ["water, flow, women", "wind, blows, vehicle"], "captions_pred_video": [null, "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a jet engine roars and wind blows "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "a toilet flushes and water drains"], "sample_ids": ["wtDqrBygTcU", "sfAvvZwdLCY"], "start_seconds": ["30", "20"], "properties": ["man, engine, run", "water drains, flushes, water"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and a motor is running", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vbZ-0lGPneg", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["a woman, a television program, a bird", "airplane, boy, fly"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, 
{"captions": ["a woman speaks and food sizzles while frying", "a car accelerates and wind blows"], "sample_ids": ["wTideSjRFS0", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["food, sizzle, woman", "accelerates, wind, blows"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["someone snores nearby", "a woman speaks happily and an animal chirps"], "sample_ids": ["spJCm8tD9Zo", "uWAAAL4CIoc"], "start_seconds": ["90", "0"], "properties": ["someone snores, nearby, someone", "a woman, chirps, animal"], "captions_pred_video": ["of a man laying on the ground with his mouth open", null], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking and a dog is barking "], "question": "which entity is more active", "label": 1}, {"captions": ["a stream of water flows quickly", "pigeons vocalize and birds chirp"], "sample_ids": ["wbHTKEJZyhc", "uiS58TNyUiw"], "start_seconds": ["20", "430"], "properties": ["stream, water, flow", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a 
bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "of the pigeon in the cage"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a stream of water?", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sU53zg9Jp7s", "yajyRTUQk3U"], "start_seconds": ["380", "400"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "a woman, something, fried"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", "- a woman cooking in the kitchen"], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["vms5XGTDVQc", "zj2R0XoFr5k"], "start_seconds": ["220", "50"], "properties": ["paper, crumpled, crinkled", "airplane, boy, fly"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["paper is crumpled and crinkled", "a woman speaks while a helicopter flies overhead "], "question": "which entity is not flying?", "label": 0}, {"captions": ["an audience gives applause", "women speak and laugh as wind 
blows"], "sample_ids": ["x6iCUDmRpKQ", "un9VQlzgZM"], "start_seconds": ["38", "5"], "properties": ["applause, audience, give", "wind, speak, laugh"], "captions_pred_video": ["a black background with the moon and stars in the sky", null], "captions_pred_audio": ["a group of people are clapping and cheering", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a weapon fires multiple times"], "sample_ids": ["tDlysoZiA1I", "sMC07Ucy7kg"], "start_seconds": ["0", "10"], "properties": ["animal, grunts, chirps", "weapon, fire, multiple"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage is from a car's point of view"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["women speak and laugh as wind blows", "wind blowing followed by a zoom"], "sample_ids": ["un9VQlzgZM", "vr8ZXjEBhMQ"], "start_seconds": ["5", "150"], "properties": ["wind, speak, laugh", "wind, blow, zoom"], "captions_pred_video": [null, "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more likely to be a video of a wind blowing?", "label": 1}, {"captions": ["a person sniffles and sneezes", "a clock ticks quietly and rhythmically"], "sample_ids": ["uRlbY6aoBU", "u7C-AEBQM"], "start_seconds": ["0", "30"], "properties": ["sneezes, sniffles, person", "ticks, rhythmic, quiet"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is sneezing ", "a ticktock of a clock"], "question": "which entity is silent", "label": 1}, {"captions": ["a small 
musical boom and then birds tweet and a few dogs pant", "a stream of water runs briefly"], "sample_ids": ["y2ZBGpgbhHM", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["birds, tweet, pant", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["birds chirping and a dog panting", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a child speaks in closed space", "vehicles pass by on a roadway"], "sample_ids": ["yW6FWLSLkx4", "tgbONvsP47Y"], "start_seconds": ["40", "0"], "properties": ["child, space, speak", "pass, vehicle, roadway"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a person uses a saw to cut some wood", "some men converse over an engine running"], "sample_ids": ["sHbXC6na9hg", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["a person, saw, wood", "men, converse, engine"], "captions_pred_video": ["a man using a tractor to cut a log into firewood youtube", null], "captions_pred_audio": ["an engine is idling and vibrating", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a person cutting wood?", "label": 0}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "wind loudly blowing while people speak in the background followed by a horn blowing"], "sample_ids": ["vdoxuJn9lTc", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], "properties": ["burp, loud, girl", "wind, blow, loudly"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a school bus decorated with christmas lights is floating 
in the water"], "captions_pred_audio": ["a child speaks followed by a burp", "a truck is revving its engine and a man is speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "a woman speaks happily and an animal chirps"], "sample_ids": ["w2M4i1mklOA", "uWAAAL4CIoc"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "a woman, chirps, animal"], "captions_pred_video": ["footage of an antique clock", null], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a woman is speaking and a dog is barking "], "question": "which entity is quieter", "label": 1}, {"captions": ["a drill runs and two people laugh", "people speak as gunfire rings out"], "sample_ids": ["tEE3MpBt1sg", "wqTCwqVRDlk"], "start_seconds": ["50", "80"], "properties": ["two people, laugh, drill", "gunfire, ring, speak"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "water flows and trickles"], "sample_ids": ["uZesmtKZGSw", "tB7hWb9gTuQ"], "start_seconds": ["250", "30"], "properties": ["car, track, man", "water, flow, trickle"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a person sniffs and sneezes"], "sample_ids": ["sZPuqDgX2V0", "uRlbY6aoBU"], "start_seconds": ["30", "0"], "properties": ["commentator, race, track", "sneezes, person, sniffs"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a man is sneezing "], "question": "which entity is not a person?", "label": 0}, {"captions": ["two men speak as a buffeting wind blows", "a female speaks softly as paper crinkles"], "sample_ids": ["y8WEcpOlT3I", "xvDdE3zNf8Y"], "start_seconds": ["40", "120"], "properties": ["wind, speak, buffeting", "a, female, speaks"], "captions_pred_video": ["on how to use a sewing machine youtube", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman speaks and crumples paper"], "question": "which entity is speaking softly", "label": 1}, {"captions": ["a person speaks over rustling leaves", "paper is crumpling consistently"], "sample_ids": ["zOZleIRqZm4", "v5cSxLaHADY"], "start_seconds": ["80", "0"], "properties": ["rustling, leaves, person", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "some men talk among st themselves as cars speed and 
race loudly"], "sample_ids": ["vddP56-ogds", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["water, flow, laugh", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "a infant makes noise and is excited"], "sample_ids": ["tDVADusiIoc", "wIJK3-5y0kA"], "start_seconds": ["60", "30"], "properties": ["man, radio, blows", "noise, excited, infant"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a baby cries and wails as an adult female speaks", "water pouring and bubbling"], "sample_ids": ["zliInBdC98Y", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["a, baby, cries, wails", "water, bubbles, pouring"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how 
to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a baby cries and a woman speaks", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a woman speaks as she rubs two objects together"], "sample_ids": ["vSeGhaZt-aI", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], "properties": ["water, sink, talk", "two objects, woman, speak"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a man is speaking and pouring liquid with 
background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a man talking?", "label": 0}, {"captions": ["wind blows and people talk while livestock vocalizes", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vXlk0lIQBFo", "vb1fPSDI4c"], "start_seconds": ["470", "30"], "properties": ["wind, talk, vocalize", "multiple, people, yell"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", null], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a crowd of people are talking and laughing"], "question": "which entity has more people", "label": 1}, {"captions": ["a person is snoring while sleeping", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["vJrjSeP17yE", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["a person is sleeping, snoring, person", "a woman, laughs, animal"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["some men converse over an engine running", "a man speaks as a car is passing by"], "sample_ids": ["sCiy7QS1U", "sK4u5T8hW78"], "start_seconds": ["300", "30"], "properties": ["men, converse, engine", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man speaks while a 
motorcycle revs and accelerates ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a goat screams and people speak in the background", "some tunes played by whistling"], "sample_ids": ["xC8kbrKJmco", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["background, goat, scream", "tune, play, whistling"], "captions_pred_video": [null, "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a goat is bleating ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a male speaks over some small clicks", "a clock ticktocks"], "sample_ids": ["uXxVebHsGZ8", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["male, clicks, speak", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["tapping occurs then a baby cries", "an infant crying frantically"], "sample_ids": ["wIJK3-5y0kA", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["a, cry, baby", "cry, infant, frantically"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "of the baby crying in the car seat"], "captions_pred_audio": ["a baby cries and a woman speaks", "a baby cries loudly"], "question": "which entity is crying frantically", "label": 1}, {"captions": ["a clock ticktocks in wind", "waves crash against a shoreline and people speak"], "sample_ids": ["yVumC9TGknc", "yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["ticktocks, clock, wind", "wave, crash, shoreline"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage of a person surfing in the ocean"], 
"captions_pred_audio": ["a series of beeps and chirps", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["a person snoring", "a man talks as something metal hits against and glass is set down"], "sample_ids": ["t8tv5YRMJUg", "x6ijhqRY38s"], "start_seconds": ["0", "250"], "properties": ["a person, snore, loud", "something metal, glass, hit"], "captions_pred_video": ["of a man getting his face licked by another man", "a chef preparing a dish with a bottle of wine and a plate of food on a table"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a man is speaking and dishes are clanging "], "question": "which entity is not a person?", "label": 1}, {"captions": ["a heavy rain falls endlessly", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wP8ZKrlx3oA", "vYkA3cfXp5Q"], "start_seconds": ["40", "30"], "properties": ["heavy, rain, fall", "engine, accelerate, idle"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a heavy rain is falling on a surface", "an engine is idling"], "question": "which entity is a moving object", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["yNtRmrn0io8", "uEU-Hg5MTN8"], "start_seconds": ["210", "27"], "properties": ["storm, distance, strike", "a woman, laughs, animal"], "captions_pred_video": ["footage of a house in the middle of the night", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["rain falls and thunder roars", "a woman is speaking and a baby is crying"], "question": "which entity is more likely to be a natural occurrence", "label": 0}, {"captions": ["a vehicle engine runs while a woman makes an announcement", 
"two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["su6FAOcOA8c", "wqZ135Ssz0"], "start_seconds": ["4", "60"], "properties": ["engine, run, woman", "two men, woman, birds"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", null], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has a woman making an announcement?", "label": 0}, {"captions": ["a woman speaks and a baby laughs", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tOj4tdLRaA", "vfYTJq7nU"], "start_seconds": ["70", "130"], "properties": ["woman, laugh, baby", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby laughs and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a baby?", "label": 0}, {"captions": ["a man speaks as a car is passing by", "a man speaks as a car is passing by"], "sample_ids": ["sK4u5T8hW78", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "a, car, pass"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 
1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a photograph", "label": 1}, {"captions": ["water runs into a sink while men speak", "dishes cling together then a man begins to speak"], "sample_ids": ["vzceMbklWc", "sQGXqGcwOTc"], "start_seconds": ["180", "3"], "properties": ["water, sink, run", "cling, speak, dishes"], "captions_pred_video": [null, "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["water is running and a man is speaking", "mechanisms are operating and water is splashing "], "question": "which entity is about a sink?", "label": 0}, {"captions": ["birds coo incessantly", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["yZrFNS7GFBQ", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["coo, bird, incessant", "music, gunfire, explosion"], "captions_pred_video": ["of the bird in the cage", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["an owl hoots in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "water flows as men speak and yell"], "sample_ids": ["vzxHnu-SFEw", "vJ7JPEFhyLA"], "start_seconds": ["80", "16"], "properties": ["two objects, woman, speak", "water, flow, men"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing?", "label": 1}, {"captions": ["a person is snoring while sleeping", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["vJrjSeP17yE", "vlS6YMeWAPo"], "start_seconds": ["40", "40"], "properties": ["a person is sleeping, snoring, person", "sheep, baa, birds"], "captions_pred_video": ["a black background with a small plane flying in the sky", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a person snoring loudly", "a goat bleats and birds chirp"], "question": "which entity is a person", "label": 0}, {"captions": ["an engine idles quietly then gradually becomes louder", "humming and rattling of an engine idling as it revs"], "sample_ids": ["vbr9mHKc8WM", "xMXvkIcaG0Y"], "start_seconds": ["40", "30"], "properties": ["noise, loudness, engine", "sound, humming, rattling"], "captions_pred_video": [null, "footage of a car's hood being opened up to reveal the engine underneath the hood"], "captions_pred_audio": ["an engine is idling", "an engine is revving and accelerating "], "question": "which entity is quieter", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "a man speaks and a rooster crows 
while men talk in the background"], "sample_ids": ["vbpKkWvfOu4", "wz7N8YRy74I"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "rooster, crow, background, men"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a person is whistling a tune", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["scYRUkrFLiQ", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["a, tune, whistle", "a woman, a television program, a bird"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person?", "label": 0}, {"captions": ["repeated tapping is accompanied by water running and a woman speaking softly", "a infant makes noise and is excited"], "sample_ids": ["wvKpEYswXO0", "wIJK3-5y0kA"], "start_seconds": ["150", "30"], "properties": ["sound, water, running", "noise, excited, infant"], "captions_pred_video": ["of the person preparing food in the kitchen", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a baby cries and a woman speaks"], "question": "which entity is making noise", "label": 1}, {"captions": ["birds chirp and an owl hoots before a man speaks briefly", "loud intermittent 
buzzing with intermittent laughter"], "sample_ids": ["wRBHTgrbiwg", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["bird, owl, speak", "loud, laughter, intermittent"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a jet engine roars, almost making a man inaudible", "a vehicle engine runs and someone speaks"], "sample_ids": ["xfaoyyzw2WU", "zF8yoL0rkbI"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "engine, run, someone"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "footage of the traffic on the street at night"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "the wind is blowing hard and water is splashing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a guy speaks with birds chirping in the background", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["v5P-ThUCINM", "y2bVZ7rz-5M"], "start_seconds": ["400", "280"], "properties": ["background, chirp, bird", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a truck is honking its horn and a siren is blaring "], "question": "which entity has a horn honking?", "label": 1}, {"captions": ["wind noise takes place into a microphone while rustling occurs", "rain falls on a surface as men speak and thunder roars"], "sample_ids": ["w8uLijTqtlU", "w0xsN8X18Y"], "start_seconds": ["70", "30"], "properties": ["wind, microphone, noise", "rain, thunder, surface"], "captions_pred_video": ["footage is blurry and shaky", null], "captions_pred_audio": ["the 
wind is blowing strongly", "a man is speaking while a motorboat is moving in the background "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["ylpYOorfH4o", "sLUnaPT5gM8"], "start_seconds": ["410", "0"], "properties": ["engine, running, wind", "loud, laughter, intermittent"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 15", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking and an engine is revving", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "paper is crumpling consistently"], "sample_ids": ["sjlVMgdGSK0", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["accelerates, vehicle, race car", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an engine runs and wind blows", "a 
vehicle engine accelerates and wind blows"], "sample_ids": ["vs65y4qmyBE", "wudZTNBtVqc"], "start_seconds": ["340", "60"], "properties": ["engine, run, wind", "accelerates, engine, wind"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage is of a parking lot with cars parked in it"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a woman sneezes then speaks", "vehicles pass by on a roadway"], "sample_ids": ["x4dZyf9Gbj0", "tgbONvsP47Y"], "start_seconds": ["130", "0"], "properties": ["sneezes, speaks, woman", "pass, vehicle, roadway"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a woman sneezes and speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks uses a drill", "a car accelerates and wind blows"], "sample_ids": ["x5eIC7S0fbg", "u0TrcHhkPQ"], "start_seconds": ["60", "20"], "properties": ["A man is speaking, uses a drill, and is a tool", "accelerates, wind, blows"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", null], "captions_pred_audio": ["a man is speaking and using a power tool ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "a man speaks as a motor runs in the background"], "sample_ids": ["wqZ135Ssz0", "xZepNM9qcRA"], "start_seconds": ["60", "30"], "properties": ["two men, woman, birds", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a man speaks while a 
motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "someone whistles a tune"], "sample_ids": ["viuTg1M-dqg", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["two men, speak, follow", "someone, tune, whistle"], "captions_pred_video": ["footage of water coming out of a hole in the ground", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["birds chirp then an animal grunts", "women speak as water runs briefly, children call out, and a man speaks"], "sample_ids": ["tDlysoZiA1I", "uRExseg-0XI"], "start_seconds": ["0", "210"], "properties": ["animal, grunt, chirp", "woman, man, water"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking while water is running and birds are chirping "], "question": "which entity has more people", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "multiple people speak and children yell while water gurgles"], "sample_ids": ["t97k0cejSQE", "vb1fPSDI4c"], "start_seconds": ["250", "30"], "properties": ["bird, chirp, insect", "multiple, people, yell"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["females talk and laugh over gusting wind", "a stream of water flows as people talk and wind blows"], "sample_ids": ["un9VQlzgZM", "xBxDz0CFVn0"], "start_seconds": ["5", "30"], "properties": ["females, talk, laugh", 
"stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a heavy rain falls endlessly"], "sample_ids": ["tiDFTC-5vU", "wP8ZKrlx3oA"], "start_seconds": ["30", "40"], "properties": ["male, duck, laugh", "heavy, rain, fall"], "captions_pred_video": [null, "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a heavy rain is falling on a surface"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["an insect buzzes around continuously", "a car speeding up in the distance"], "sample_ids": ["v25l1jef3JY", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["buzzes, continuously, insect", "distance, car, speed"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["low humming with a clock ticking and birds chirping", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yVumC9TGknc", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["humming, clock, birds", "rustling, ducks, quack"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", null], "captions_pred_audio": ["a series of beeps and chirps", "a duck quacks and a woman speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a guy speaks with birds chirping 
in the background", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["v5P-ThUCINM", "wDVMhEdTiVw"], "start_seconds": ["400", "30"], "properties": ["background, chirp, bird", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and birds are chirping", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xyL9F5VrjkE", "uZesmtKZGSw"], "start_seconds": ["20", "250"], "properties": ["wind, motor, distance", "men, talk, cars"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["scraping and female speech with distant music", "a drill runs and two people laugh"], "sample_ids": ["yHeVV-xeOxQ", "tEE3MpBt1sg"], "start_seconds": ["130", "50"], "properties": ["female, speech, music", "two people, laugh, drill"], "captions_pred_video": ["of a girl milking a goat's udder", "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["a 
woman is speaking and scraping sounds are heard in the background ", "people are laughing breathing and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sOa7g-44Dag", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["audio, scratching, man", "a woman, something, fried"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "an engine sputters followed by a car zooming by"], "sample_ids": ["zl9Dqx-j7q4", "u5RmF3c3Aw"], "start_seconds": ["6", "60"], "properties": ["motors rev, laugh, loudly", "engine, car, zoom"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a race car accelerates and skids with wind noise in the background "], "question": "which entity is a car?", "label": 1}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vmrxwuAMb2I", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["a dog, inhales, exhales", "applause, audience, yells"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a dog barks and growls", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "small 
dogs yip and bark sharply"], "sample_ids": ["sEprKHm8Sj8", "v-wcQf4BDY0"], "start_seconds": ["90", "120"], "properties": ["car, tires, slows", "bark, yip, sharply"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "a person is snoring while sleeping"], "sample_ids": ["tK4VlLsNxak", "vJrjSeP17yE"], "start_seconds": ["120", "40"], "properties": ["a, dial, telephone", "a person is sleeping, snoring, person"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a man rubs two objects together then speaks", "a motorcycle engine works nearby"], "sample_ids": ["vveS8HT7Uog", "tOSWIURC-4"], "start_seconds": ["100", "0"], "properties": ["a man, objects, speak", "engine, work, nearby"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a lawn mower is running "], "question": "which object is moving", "label": 1}, {"captions": ["a person snoring", "vehicles pass by on a roadway"], "sample_ids": ["t8tv5YRMJUg", "tgbONvsP47Y"], "start_seconds": ["0", "0"], "properties": ["a person, snore, loud", "pass, vehicle, roadway"], 
"captions_pred_video": ["of a man getting his face licked by another man", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["someone whistles a tune", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sIXTftIuUgw", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["someone, tune, whistle", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person whistling a song", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "someone whistles a tune"], "sample_ids": ["wy1eKjR7KC0", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["people, talk, distance", "someone, tune, whistle"], "captions_pred_video": ["two police officers riding motorcycles down the street", null], "captions_pred_audio": ["a man is speaking and a siren is going off", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["a man speaks, then dials a rotary telephone", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["tK4VlLsNxak", "uZesmtKZGSw"], "start_seconds": ["120", "250"], "properties": ["a, dial, telephone", "men, talk, cars"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about a man speaking and dialing a rotary telephone?", "label": 0}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sofxkNWaP0s", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["wind, engine, louder", "People, motor, brakes"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", null], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a man speaks while water drains", "a muffled toilet flushes and the water drains"], "sample_ids": ["vSeGhaZt-aI", "sfAvvZwdLCY"], "start_seconds": ["50", "20"], "properties": ["water, drain, man", "flushes, drains, water"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a toilet is flushed"], "question": "which entity has more water", "label": 1}, {"captions": ["a bus engine idles while a woman speaks making an announcement", "a saw finishes running as metal clings in the background"], "sample_ids": ["su6FAOcOA8c", "zofjfKhqLk8"], "start_seconds": ["4", "10"], "properties": ["engine, idle, woman", "background, metal, clings"], "captions_pred_video": ["shows a group of people 
riding on a crowded subway train", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a large engine is running and a bell is ringing"], "question": "which entity is about a saw?", "label": 1}, {"captions": ["a woman speaks as frying food sizzles", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wTideSjRFS0", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["food, sizzle, woman", "applause, audience, yells"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "birds chirp in the background while a horse neighs followed by a girl speaking"], "sample_ids": ["vSeGhaZt-aI", "s59PfAghdkM"], "start_seconds": ["50", "0"], "properties": ["water, bubbles, speak", "bird, chirp, background, horse, neigh"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "is an anime scene featuring two people and a horse in the foreground and a fence in the background"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "birds are chirping a horse is neighing and a woman is speaking "], "question": "which entity has a horse in it?", "label": 1}, {"captions": ["birds chirp as a bell rings", "bees buzz and wind blows"], "sample_ids": ["ziUT9IFTkjg", "tMJne1a4AFI"], "start_seconds": ["10", "0"], "properties": ["chirp, bell, ring", "bees buzz, wind blows, bees"], "captions_pred_video": [null, "a swarm of bees on the ground"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a swarm of bees buzzing around"], "question": 
"which entity is buzzing", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "an infant crying frantically"], "sample_ids": ["yYEVLuqEytU", "zwOBqeFTgiU"], "start_seconds": ["40", "30"], "properties": ["grunt, slurp, background", "cry, infant, frantically"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "of the baby crying in the car seat"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a baby cries loudly"], "question": "which entity is a baby", "label": 1}, {"captions": ["people speak and tapping occurs", "a man speaking with light rustling"], "sample_ids": ["tFCUUGdREgA", "zOZleIRqZm4"], "start_seconds": ["70", "80"], "properties": ["people, tap, speak", "light, rustling, man"], "captions_pred_video": ["a person riding a white horse in an indoor arena", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a man is speaking and walking with wind noise in the background ", "a man is speaking with crickets chirping in the background"], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs bark as an engine runs and a person whistles", "a woman speaks happily and an animal chirps"], "sample_ids": ["zY3icUyMdh8", "uWAAAL4CIoc"], "start_seconds": ["20", "0"], "properties": ["dog, bark, engine", "a woman, chirps, animal"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "birds chirp and objects are moved around"], "sample_ids": ["yZrFNS7GFBQ", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["pigeon, buzzes, insect", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["of the bird in the cage", 
"footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["an owl hoots in the background ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "music plays and a woman speaks on a radio before gunshots are fired"], "sample_ids": ["wqZ135Ssz0", "xKB8O8LTs6s"], "start_seconds": ["60", "70"], "properties": ["two men, woman, birds", "music, radio, gunshots"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has a woman speaking on a radio?", "label": 1}, {"captions": ["a man speaks as a boat engine runs", "vehicles pass by on a roadway"], "sample_ids": ["wtDqrBygTcU", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["man, engine, run", "pass, vehicle, roadway"], "captions_pred_video": ["shows a person riding on the back of a boat as it speeds through the water", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a motor is running", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["someone whistles a song", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sIXTftIuUgw", "uZesmtKZGSw"], "start_seconds": ["90", "250"], "properties": ["someone, song, whistle", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["someone is typing on a computer keyboard", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["v0x1odnXtP0", "su6FAOcOA8c"], "start_seconds": ["210", "4"], "properties": ["keyboard, type, computer", "engine, idle, woman"], "captions_pred_video": ["how to make money on youtube in spanish", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a person is typing on a keyboard", "a woman is speaking and a subway train is moving "], "question": "which is not a person", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "paper is crumpling consistently"], "sample_ids": ["y8dSeubCNI", "v5cSxLaHADY"], "start_seconds": ["4", "0"], "properties": ["men, women, car", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["an engine revving and people talking in the background", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["an audience gives applause", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["x6iCUDmRpKQ", "tDVADusiIoc"], "start_seconds": ["38", "60"], "properties": ["applause, audience, give", "water, radio, man"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a group of people are clapping and cheering", "a man is speaking while the wind is blowing and 
water is splashing"], "question": "which is not a person", "label": 0}, {"captions": ["waves crash against a shoreline and people speak", "a clock ticktocks"], "sample_ids": ["yFB25fqfU8I", "v-g-j2uTByM"], "start_seconds": ["300", "30"], "properties": ["wave, crash, shoreline", "ticktocks, clock, ticktocks"], "captions_pred_video": ["footage of a person surfing in the ocean", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and laughing while water is splashing and gurgling", "a clock is ticking loudly"], "question": "which entity is a clock?", "label": 1}, {"captions": ["children cry and people talk", "paper is crumpling consistently"], "sample_ids": ["xLwHe825Zs", "v5cSxLaHADY"], "start_seconds": ["18", "0"], "properties": ["people talk, children cry, people talk", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a baby cries and a woman speaks", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sK4u5T8hW78", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["a, car, pass", "wind, blow, vehicle"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a 
vehicle engine running?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a sleeping person snores and wheezes"], "sample_ids": ["sfAvvZwdLCY", "spJCm8tD9Zo"], "start_seconds": ["20", "90"], "properties": ["water drains, flushes, water", "snores, wheezes, sleeps"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a toilet is flushed", "a person is snoring loudly"], "question": "which entity is a source of noise", "label": 1}, {"captions": ["a machine beeps continuously", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["y682ml90jGw", "yDoT73BWsdA"], "start_seconds": ["11", "10"], "properties": ["beeps, machine, continuously", "engine, revs, vehicle"], "captions_pred_video": [null, "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a beeping sound is being made ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "pigeons vocalize and birds chirp"], "sample_ids": ["yJ0TePmaOo", "uiS58TNyUiw"], "start_seconds": ["390", "430"], "properties": ["two hard objects, man, speak", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a infant makes noise and is excited", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wIJK3-5y0kA", "vb1fPSDI4c"], "start_seconds": ["30", "30"], "properties": ["noise, excited, infant", "multiple, people, yell"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a crowd of people are talking and laughing"], "question": "which 
entity is more quiet", "label": 0}, {"captions": ["a dog whimpers as someone inhales/exhales briefly", "an insect buzzes around continuously"], "sample_ids": ["vmrxwuAMb2I", "v25l1jef3JY"], "start_seconds": ["40", "0"], "properties": ["a dog, inhales, exhales", "buzzes, continuously, insect"], "captions_pred_video": ["of the dog laying on the bed with his head out of the blanket", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a dog barks and growls", "a fly is buzzing around a microphone "], "question": "which entity is not a dog?", "label": 1}, {"captions": ["a helicopter engine idles continuously", "vehicles pass by on a roadway"], "sample_ids": ["ugHJF0hfYkg", "tgbONvsP47Y"], "start_seconds": ["10", "0"], "properties": ["engine, idle, continuously", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a helicopter is flying overhead ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "people speak as gunfire rings out"], "sample_ids": ["tIY7qOV3rEM", "wqTCwqVRDlk"], "start_seconds": ["0", "80"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "gunfire, ring, speak"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a flush is followed by gurgling water, then another flush", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["tqR406bGiE", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["flush, water, gurgle", "People, motor, brakes"], 
"captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about water?", "label": 0}, {"captions": ["water flows as men speak and yell", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vJ7JPEFhyLA", "w5W5Kqtc8E"], "start_seconds": ["16", "100"], "properties": ["water, flow, men", "wind, blow, vehicle"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a toilet flushes and water drains", "a motor noise is accompanied by a door opening and closing"], "sample_ids": ["sfAvvZwdLCY", "vBHyYJ8pL0"], "start_seconds": ["20", "2"], "properties": ["water drains, flushes, water", "noise, door, opening"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is accompanied by a door opening and closing?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a duck quacks continuously"], "sample_ids": ["uRExseg-0XI", "vh30P49Po6s"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "quacks, continuously, duck"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man is filing a hard object", "an infant crying as a 
woman laughs"], "sample_ids": ["vveS8HT7Uog", "xhmRY9yhC7c"], "start_seconds": ["100", "20"], "properties": ["a man, hard, object", "a, laugh, infant"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a baby cries and a woman speaks"], "question": "which object is harder to file", "label": 0}, {"captions": ["an aircraft engine runs as people speak", "a man speaks as a motor runs in the background"], "sample_ids": ["wTjoRj1se3U", "xZepNM9qcRA"], "start_seconds": ["390", "30"], "properties": ["engine, run, people", "background, motor, run"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a jet engine is running and people are talking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["zofjfKhqLk8", "sLUnaPT5gM8"], "start_seconds": ["10", "0"], "properties": ["noise, stop, motor", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yaln9y8I7ms", "tDVADusiIoc"], "start_seconds": ["230", "60"], "properties": ["female, flushes, toilet", "water, radio, man"], 
"captions_pred_video": ["footage is blurry and out of focus", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has a female speaking?", "label": 0}, {"captions": ["a person is snoring while sleeping", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vJrjSeP17yE", "vfYTJq7nU"], "start_seconds": ["40", "130"], "properties": ["a person is sleeping, snoring, person", "rustling, ducks, quack"], "captions_pred_video": ["a black background with a small plane flying in the sky", null], "captions_pred_audio": ["a person snoring loudly", "a duck quacks and a woman speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["tQWGZLItBXk", "uYT5gxnyMWM"], "start_seconds": ["170", "50"], "properties": ["music, kid, speak", "a, scream, girl"], "captions_pred_video": ["worms revolution screenshots", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman is speaking and a baby is crying"], "question": "which entity is followed by a scream", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xzKKf9bKNUo", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["background, noise, snoring", "wind, blow, vehicle"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", null], "captions_pred_audio": ["a person snoring loudly", "a motorboat is moving and people are shouting and cheering "], "question": "which 
entity is a vehicle?", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "water splashes as an animal walks through"], "sample_ids": ["vddP56-ogds", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["water, flow, laugh", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["water running down a sink while a man is talking", "birds chirp and objects are moved around"], "sample_ids": ["vSeGhaZt-aI", "yPUYU6t3rwo"], "start_seconds": ["50", "370"], "properties": ["water, sink, talk", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["an animal hissing followed by a man mumbling then a pig oinking while birds chirp in the background", "a helicopter engine idles continuously"], "sample_ids": ["yYEVLuqEytU", "ugHJF0hfYkg"], "start_seconds": ["40", "10"], "properties": ["animal, pig, background", "engine, idle, continuously"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["several sheep bleat and a man speaks", "a helicopter is flying overhead "], "question": "which entity is silent", "label": 1}, {"captions": ["a person screams glaringly", "a stream of water runs briefly"], "sample_ids": ["xC8kbrKJmco", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["glaringly, screams, person", "stream, water, run"], 
"captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a goat is bleating ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "a man speaks as crickets sing"], "sample_ids": ["sLUnaPT5gM8", "ryFDPxgDOGc"], "start_seconds": ["0", "570"], "properties": ["loud, laughter, intermittent", "a, crickets, sing"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "a group of people dressed in camouflage and hunting gear in the dark"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a man is speaking with crickets chirping in the background"], "question": "which entity is quieter", "label": 1}, {"captions": ["an aircraft engine runs", "water flows and trickles"], "sample_ids": ["yLCORCnd35Q", "tB7hWb9gTuQ"], "start_seconds": ["0", "30"], "properties": ["engine, aircraft, runs", "water, flow, trickle"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "some men converse over an engine running"], "sample_ids": ["u--KhUW8l1Y", "sCiy7QS1U"], "start_seconds": ["0", "300"], "properties": ["horn, siren, life", "men, converse, engine"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", null], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "a few ducks quack and 
scamper and a man speaks"], "sample_ids": ["zliInBdC98Y", "w2bYrCVLT60"], "start_seconds": ["30", "120"], "properties": ["a, baby, cries, wails", "ducks, speak, quack"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "of the ducks drinking from a pink pool in the grass"], "captions_pred_audio": ["a baby cries and a woman speaks", "ducks are quacking and a man is speaking"], "question": "which entity is speaking", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a clock ticktocks"], "sample_ids": ["sa6TLVbooCc", "v-g-j2uTByM"], "start_seconds": ["240", "30"], "properties": ["people, laugh, child", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["someone whistles a tune", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sIXTftIuUgw", "zj2R0XoFr5k"], "start_seconds": ["90", "50"], "properties": ["someone, tune, whistle", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a person whistling a song", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["a door opens and birds chirp", "a man speaks as a car is passing by"], "sample_ids": ["yeFvk9x0wWI", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "a, car, pass"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["people cheer as a vehicle engine revs", "someone snores nearby"], "sample_ids": ["xjhAnI2q6hM", "spJCm8tD9Zo"], "start_seconds": ["6", "90"], "properties": ["engine revs, vehicle, people", "someone snores, nearby, someone"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a person is snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "waves crash against a shoreline and people speak"], "sample_ids": ["v0x1odnXtP0", "yFB25fqfU8I"], "start_seconds": ["210", "300"], "properties": ["keyboard, type, computer", "wave, crash, shoreline"], "captions_pred_video": ["how to make money on youtube in spanish", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person is typing on a keyboard", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which is a natural phenomenon", "label": 1}, {"captions": ["rustling leaves with some light squeaking and wind blowing hard followed by a light smack", "a stream of water flows as people talk and wind blows"], "sample_ids": ["yI-KvObbDoY", "xBxDz0CFVn0"], "start_seconds": ["260", "30"], "properties": ["sound, smack, wind", "stream, water, flow"], "captions_pred_video": ["of a man hanging clothes on a clothesline in the backyard", "footage 
is blurry and out of focus"], "captions_pred_audio": ["the wind is blowing and footsteps are walking ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "people speak in a closed space"], "sample_ids": ["sYITalLZjj4", "sTpirNYo8vQ"], "start_seconds": ["30", "30"], "properties": ["water, rushes, background, birds", "people, space, speak"], "captions_pred_video": ["two ducks are swimming in the water near each other", "of a man taking a selfie on a bus"], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking while a car is revving and accelerating "], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "water flows and trickles"], "sample_ids": ["uC9dtII1KDI", "tB7hWb9gTuQ"], "start_seconds": ["150", "30"], "properties": ["wind, gusts, distance", "water, flow, trickle"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "water is splashing and gurgling"], "question": "which entity is more likely to be a natural phenomenon", "label": 1}, {"captions": ["paper folding and crinkling", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zPpG3RD8lSs", "yajyRTUQk3U"], "start_seconds": ["20", "400"], "properties": ["paper, fold, crinkle", "a woman, something, fried"], "captions_pred_video": ["how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of 
paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's", "- a woman cooking in the kitchen"], "captions_pred_audio": ["the wind blows and a mouse clicks ", "a woman is speaking while food is frying in the background"], "question": "which entity is about food?", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "someone is typing on a computer keyboard"], "sample_ids": ["vYkA3cfXp5Q", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["speed, idle, accelerate", "keyboard, type, computer"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "how to make money on youtube in spanish"], "captions_pred_audio": ["an engine is idling", "a person is typing on a keyboard"], "question": "which is a person", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a toilet flushes and a female speaks"], "sample_ids": ["vddP56-ogds", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["water, splash, person, laugh", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["v7jJS8aAyA", "wDVMhEdTiVw"], "start_seconds": ["10", "30"], "properties": ["wind, blows, loudly", 
"gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not followed by water sloshing nearby?", "label": 0}, {"captions": ["a vehicle engine accelerates and wind blows", "a toilet flushes and a female speaks"], "sample_ids": ["wudZTNBtVqc", "yaln9y8I7ms"], "start_seconds": ["60", "230"], "properties": ["accelerates, engine, wind", "female, flushes, toilet"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", "footage is blurry and out of focus"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a toilet flushes and a man speaks"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["sQwlkXjQabo", "sLUnaPT5gM8"], "start_seconds": ["10", "0"], "properties": ["liquid, surface, spray", "loud, laughter, intermittent"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["spraying followed by silence", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["an adult male speaks and dials a rotary phone", "an airplane engine runs"], "sample_ids": ["tK4VlLsNxak", "yVPZ2MNWpms"], "start_seconds": ["120", "0"], "properties": ["An adult male speaks, dials, and speaks into a rotary phone", "engine, airplane, runs"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a car is driving by on the road "], 
"question": "which entity is a moving object", "label": 1}, {"captions": ["a man woman speak while crickets sing", "a clock ticktocks"], "sample_ids": ["zTLVJCo4WEE", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["a, crickets, sing", "ticktocks, clock, ticktocks"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["women speak and laugh as wind blows", "water flows and trickles"], "sample_ids": ["un9VQlzgZM", "tB7hWb9gTuQ"], "start_seconds": ["5", "30"], "properties": ["wind, speak, laugh", "water, flow, trickle"], "captions_pred_video": [null, "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "water is splashing and gurgling"], "question": "which entity is moving", "label": 1}, {"captions": ["water runs from a faucet while some men speak and the water runs in the sink", "vehicles pass by on a roadway"], "sample_ids": ["vzceMbklWc", "tgbONvsP47Y"], "start_seconds": ["180", "0"], "properties": ["water, faucet, sink", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["water is running and a man is speaking", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks as something metal hits against and glass is set down", "a child speaks in closed space"], "sample_ids": ["x6ijhqRY38s", "yW6FWLSLkx4"], "start_seconds": ["250", "40"], "properties": ["something metal, glass, hit", "child, space, speak"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of the doll sitting on the bed with her hand 
outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["an insect buzzes around continuously", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["v25l1jef3JY", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["buzzes, continuously, insect", "animal, grunts, snorts"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 0}, {"captions": ["an audience gives applause", "dishes cling together then a man begins to speak"], "sample_ids": ["x6iCUDmRpKQ", "sQGXqGcwOTc"], "start_seconds": ["38", "3"], "properties": ["applause, audience, give", "cling, speak, dishes"], "captions_pred_video": ["a black background with the moon and stars in the sky", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a group of people are clapping and cheering", "mechanisms are operating and water is splashing "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a jet engine roars, almost making a man inaudible", "a man speaks followed by another man speaking outside"], "sample_ids": ["xfaoyyzw2WU", "viuTg1M-dqg"], "start_seconds": ["180", "30"], "properties": ["loud, jet engine, roar", "two men, speak, follow"], "captions_pred_video": ["footage of an airplane on the tarmac at an airport", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["an aircraft engine roars and a man speaks ", "a man is speaking with background noise and breathing sounds "], "question": "which is not a pair of men speaking?", "label": 0}, {"captions": ["dog 
barking and vehicle engine idling followed shortly by vehicle engine revving", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["zY3icUyMdh8", "ziUT9IFTkjg"], "start_seconds": ["20", "10"], "properties": ["dog, bark, engine", "background, birds, rustling"], "captions_pred_video": ["footage of a bus driving through a residential street at night", null], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "birds are chirping and a chime is ringing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["vehicle engines accelerate as a man commentates on an intercom", "an airplane engine runs"], "sample_ids": ["sZPuqDgX2V0", "yVPZ2MNWpms"], "start_seconds": ["30", "0"], "properties": ["engine, accelerate, intercom", "engine, airplane, runs"], "captions_pred_video": [null, "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a car is driving by on the road "], "question": "which engine is running", "label": 0}, {"captions": ["loud clanking and banging with brief male speech", "a toilet flushes and a female speaks"], "sample_ids": ["sWZzXuWYY", "yaln9y8I7ms"], "start_seconds": ["420", "230"], "properties": ["male, speech, banging", "female, flushes, toilet"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a toilet flushes and a man speaks"], "question": "which entity is silent", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "wind blows and people scream while an engine revs"], "sample_ids": ["wqUmIEzuNz4", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["frog, bird, vocalize", "wind, engine, scream"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", null], "captions_pred_audio": ["a cat meows and rustles", "a motorboat is moving and people are 
shouting and cheering "], "question": "which entity is not a frog?", "label": 1}, {"captions": ["wind blows in gusts as a woman speaks in the distance", "multiple motorcycles pass by as a man speaks"], "sample_ids": ["uC9dtII1KDI", "zcDwZ6W7E3E"], "start_seconds": ["150", "180"], "properties": ["wind, gusts, distance", "man, speak, motorcycles"], "captions_pred_video": ["footage of a person riding a horse in a riding arena", "2 people riding motorcycles down a mountain road with trees lining the sides of the road"], "captions_pred_audio": ["a woman is speaking with wind noise and breathing in the background ", "a man is speaking while a car accelerates and revs its engine "], "question": "which entity is more likely to be in a city", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "a man speaks as a motor runs in the background"], "sample_ids": ["vKrYfzleLB8", "xZepNM9qcRA"], "start_seconds": ["110", "30"], "properties": ["a, ring, gunshots", "background, motor, run"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a man yell?", "label": 0}, {"captions": ["a train horn blows as it passes by", "a duck quacks continuously"], "sample_ids": ["zVacuqSb4LI", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["horn, blows, train", "quacks, continuously, duck"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "water flows as men speak and yell"], "sample_ids": ["zcDwZ6W7E3E", "vJ7JPEFhyLA"], "start_seconds": ["180", "16"], "properties": ["man, speak, motorcycles", "water, flow, men"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing as men speak and yell?", "label": 1}, {"captions": ["a car speeding up in the distance", "a woman speaks happily and an animal chirps"], "sample_ids": ["u0TrcHhkPQ", "uWAAAL4CIoc"], "start_seconds": ["20", "0"], "properties": ["distance, car, speed", "a woman, chirps, animal"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a woman is speaking and a dog is barking "], "question": "which entity is not moving", "label": 1}, {"captions": ["a duck quacks continuously", "a machine beeps continuously"], "sample_ids": ["vh30P49Po6s", "y682ml90jGw"], "start_seconds": ["30", "11"], "properties": ["quacks, continuously, duck", "beeps, machine, continuously"], "captions_pred_video": ["of a man 
brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "an engine runs loudly"], "sample_ids": ["v0x1odnXtP0", "vqZuVbG6-HI"], "start_seconds": ["210", "130"], "properties": ["keyboard, type, computer", "loud, engine, run"], "captions_pred_video": ["how to make money on youtube in spanish", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a person is typing on a keyboard", "a lawn mower is running and men are speaking "], "question": "which is quieter", "label": 0}, {"captions": ["a person is snoring while sleeping", "someone snores nearby"], "sample_ids": ["vJrjSeP17yE", "spJCm8tD9Zo"], "start_seconds": ["40", "90"], "properties": ["a person is sleeping, snoring, person", "someone snores, nearby, someone"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a person snoring loudly", "a person is snoring loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["birds coo incessantly", "a horn blows as a train chugs along and warning bells ring"], "sample_ids": ["yZrFNS7GFBQ", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["coo, bird, incessant", "a train, a horn, a bell"], "captions_pred_video": ["of the bird in the cage", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["an owl hoots in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a warning", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["vs65y4qmyBE", "yajyRTUQk3U"], "start_seconds": ["340", "400"], "properties": ["engine, run, man", "a woman, something, 
fried"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an electronic device bleeps once", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["tHJ6JSa8Y4", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["bleeps, electronic, device", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a clock is ticking and beeping", "a woman is speaking and a subway train is moving "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a kid speaks followed by music playing", "a stream of water runs briefly"], "sample_ids": ["tQWGZLItBXk", "x-PeY8Yb8M4"], "start_seconds": ["170", "300"], "properties": ["music, kid, speak", "stream, water, run"], "captions_pred_video": ["worms revolution screenshots", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["people speak in a closed space", "an insect buzzes around continuously"], "sample_ids": ["sTpirNYo8vQ", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["people, space, speak", "buzzes, continuously, insect"], "captions_pred_video": ["of a man taking a selfie on a bus", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", 
"vehicles pass by on a roadway"], "sample_ids": ["yZrFNS7GFBQ", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["pigeon, buzzes, insect", "pass, vehicle, roadway"], "captions_pred_video": ["of the bird in the cage", "footage of a fire truck entering a garage"], "captions_pred_audio": ["an owl hoots in the background ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["wnpJndXuxLc", "w5W5Kqtc8E"], "start_seconds": ["50", "100"], "properties": ["blows, vehicle, train", "wind, blow, vehicle"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a train blowing a horn?", "label": 0}, {"captions": ["wind blows and women speak as livestock vocalizes", "several insects fly while two men talk"], "sample_ids": ["vXlk0lIQBFo", "s-T9OVOiMLo"], "start_seconds": ["470", "330"], "properties": ["wind, speak, vocalize", "several, fly, men"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is about animals?", "label": 0}, {"captions": ["a toilet flushes and a female speaks", "people speak and tapping occurs"], "sample_ids": ["yaln9y8I7ms", "tFCUUGdREgA"], "start_seconds": ["230", "70"], "properties": ["female, flushes, toilet", "people, tap, speak"], "captions_pred_video": ["footage is blurry and out of focus", "a person riding a white horse in an indoor arena"], 
"captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking and walking with wind noise in the background "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["an engine idles consistently before sputtering some", "a stream of water flows as people talk and wind blows"], "sample_ids": ["rwTERCUno", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["engine, idle, sputter", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["zl9Dqx-j7q4", "y2bVZ7rz-5M"], "start_seconds": ["6", "280"], "properties": ["engine, laugh, loud", "motor noise, horn, siren"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a jet engine roars ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["w8uLijTqtlU", "y8WEcpOlT3I"], "start_seconds": ["70", "40"], "properties": ["wind, microphone, noise", "harsh, wind, blows"], "captions_pred_video": ["footage is blurry and shaky", "on how to use a sewing machine youtube"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking with wind noise in the background "], "question": "which entity is a recording of a harsh wind blowing?", "label": 1}, {"captions": ["male speech with light ticking", "several ducks quack and cocks crow far away"], "sample_ids": ["xO-Q2BlIIPU", 
"sNB8zxXneIM"], "start_seconds": ["30", "20"], "properties": ["male, speech, ticking", "several, quack, cocks"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a group of geese in a cage"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a rooster is crowing and wind is blowing "], "question": "which entity is a bird", "label": 1}, {"captions": ["heavy rain splashes as it falls", "wind blows as people chatter quietly"], "sample_ids": ["wP8ZKrlx3oA", "xBxDz0CFVn0"], "start_seconds": ["40", "30"], "properties": ["fall, rain, splash", "wind, chatter, people"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a music is played followed by a frog croaking and then music is played again", "someone whistles a tune"], "sample_ids": ["voJh2gJxXhA", "sIXTftIuUgw"], "start_seconds": ["50", "90"], "properties": ["music, frog, croak", "someone, tune, whistle"], "captions_pred_video": ["a frog on a black background with a red diamond in the center", null], "captions_pred_audio": ["music is playing and crickets are chirping ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a pigeon cooing as an insect buzzes by briefly", "paper is crumpling consistently"], "sample_ids": ["yZrFNS7GFBQ", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["pigeon, buzzes, insect", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of the bird in the cage", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["an owl hoots in the background ", "paper is crumpled and crinkled"], 
"question": "which entity is crumpling", "label": 1}, {"captions": ["an aircraft engine runs", "winds blows roughly as a vehicle races past"], "sample_ids": ["yLCORCnd35Q", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["engine, aircraft, runs", "wind, blows, vehicle"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["a vehicle is skidding and squealing tires", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["soTOh3zYJfY", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["vehicle, skid, tires", "applause, audience, yells"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["uWAAAL4CIoc", "uEU-Hg5MTN8"], "start_seconds": ["0", "27"], "properties": ["a woman, chirps, animal", "a woman, laughs, animal"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking happily and an animal chirps?", "label": 0}, {"captions": ["a man speaks as he moves silverware in a bowl", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["x6ijhqRY38s", "uEU-Hg5MTN8"], "start_seconds": ["250", "27"], "properties": ["bowl, silverware, 
man", "animal, grunts, snorts"], "captions_pred_video": ["a chef preparing a dish with a bottle of wine and a plate of food on a table", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and dishes are clanging ", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a horn honks and then loudly blares"], "sample_ids": ["sG7TyPnFDR0", "wnpJndXuxLc"], "start_seconds": ["180", "50"], "properties": ["beeps, machine, smoke alarm", "horn, honk, loud"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["a sheep baa followed by birds chirping and then more sheep baaing", "an infant crying as a woman laughs"], "sample_ids": ["vlS6YMeWAPo", "xhmRY9yhC7c"], "start_seconds": ["40", "20"], "properties": ["sheep, baa, birds", "a, laugh, infant"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a goat bleats and birds chirp", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["wyllXV6PjKo", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["a kid, talk, cry", "harsh, wind, blows"], "captions_pred_video": [null, "on how to use a sewing machine youtube"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking with wind noise in the background "], "question": "which entity has a harsh wind blowing?", 
"label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a person speaks over rustling leaves"], "sample_ids": ["uYT5gxnyMWM", "zOZleIRqZm4"], "start_seconds": ["50", "80"], "properties": ["person, spray, yell", "rustling, leaves, person"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a person picking berries from the bushes in the garden"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking with crickets chirping in the background"], "question": "which entity has a person speaking over a background of rustling leaves?", "label": 1}, {"captions": ["water splashes as an animal walks through", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["w1ir-sZ3Im8", "zl9Dqx-j7q4"], "start_seconds": ["90", "6"], "properties": ["animal, water, splashes", "engine, laugh, loud"], "captions_pred_video": ["footage of a group of people riding horses through a river", "footage of a man driving a car in the dark"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["ul60S8TXDA8", "uYT5gxnyMWM"], "start_seconds": ["60", "50"], "properties": ["sound, distance, bell", "female, spraying, scream"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "multiple people speak and children yell while water gurgles"], "sample_ids": ["vBslzh7saPw", "vb1fPSDI4c"], "start_seconds": ["90", "30"], 
"properties": ["power, scream, increase", "multiple, people, yell"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["uYT5gxnyMWM", "sLUnaPT5gM8"], "start_seconds": ["50", "0"], "properties": ["female, spraying, scream", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more calming", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["y8WEcpOlT3I", "uYT5gxnyMWM"], "start_seconds": ["40", "50"], "properties": ["wind, speak, buffeting", "a, scream, girl"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a woman is speaking and a baby is crying"], "question": "which entity is a recording of a girl speaking?", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["zofjfKhqLk8", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["noise, stop, motor", "music, gunfire, explosion"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a large engine is running and a 
bell is ringing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a clock ticktocks"], "sample_ids": ["sQwlkXjQabo", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["liquid, surface, spray", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["spraying followed by silence", "a clock is ticking loudly"], "question": "which entity is not a clock?", "label": 0}, {"captions": ["an engine runs loudly", "a telephone rings followed by a woman talking"], "sample_ids": ["vqZuVbG6-HI", "tGcFnX0GHI"], "start_seconds": ["130", "0"], "properties": ["loud, engine, run", "ring, talk, woman"], "captions_pred_video": ["footage is blurry because it's raining outside", null], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a dial tone sounds followed by a woman speaking"], "question": "which is quieter", "label": 1}, {"captions": ["water running down a sink while a man is talking", "a machine beeps continuously"], "sample_ids": ["vSeGhaZt-aI", "y682ml90jGw"], "start_seconds": ["50", "11"], "properties": ["water, sink, talk", "beeps, machine, continuously"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a beeping sound is being made "], "question": "which entity is silent", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "people speak as gunfire rings out"], "sample_ids": ["zY3icUyMdh8", "wqTCwqVRDlk"], "start_seconds": ["20", "80"], "properties": ["dog, bark, engine", "gunfire, ring, speak"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "of 
a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["zgUgkpk78xU", "xfaoyyzw2WU"], "start_seconds": ["70", "180"], "properties": ["clinking, humming, horn", "loud, jet engine, roar"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "several insects fly while two men talk"], "sample_ids": ["vz8868znkVQ", "s-T9OVOiMLo"], "start_seconds": ["60", "330"], "properties": ["audio, click, kid speaking", "several, fly, men"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video", "label": 1}, {"captions": ["a woman and man are speaking", "popping and crackling repeats as men yell and laugh"], "sample_ids": ["vbpKkWvfOu4", "rqu8iB22IY"], "start_seconds": ["560", "5"], "properties": ["two people, speaking, woman, man", "sound, repeats, laugh"], "captions_pred_video": 
["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", null], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a dog barks and a man speaks while music plays "], "question": "which entity has more people speaking", "label": 0}, {"captions": ["a man is snoring loudly and repeatedly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sncRqQ67iJU", "tdWhHV3X25Q"], "start_seconds": ["460", "60"], "properties": ["loud, repeatedly, man", "applause, audience, yells"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a person is snoring", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a infant makes noise and is excited"], "sample_ids": ["sfAvvZwdLCY", "wIJK3-5y0kA"], "start_seconds": ["20", "30"], "properties": ["flushes, drains, water", "noise, excited, infant"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a toilet is flushed", "a baby cries and a woman speaks"], "question": "which entity is quieter", "label": 0}, {"captions": ["someone snores nearby", "water pouring and bubbling"], "sample_ids": ["spJCm8tD9Zo", "uyRfq-jKPpo"], "start_seconds": ["90", "50"], "properties": ["someone snores, nearby, someone", "water, bubbles, pouring"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an 
afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a person is snoring loudly", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a car speeding up in the distance", "some men converse over an engine running"], "sample_ids": ["u0TrcHhkPQ", "sCiy7QS1U"], "start_seconds": ["20", "300"], "properties": ["distance, car, speed", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man speaks while a motorcycle revs and accelerates "], "question": "which is a static image", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "people speak as gunfire rings out"], "sample_ids": ["sWZzXuWYY", "wqTCwqVRDlk"], "start_seconds": ["420", "80"], "properties": ["male, speech, banging", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a clock ticktocks"], "sample_ids": ["soTOh3zYJfY", "v-g-j2uTByM"], "start_seconds": ["40", "30"], "properties": ["vehicle, skid, tires", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "in your own words a cuckoo clock hanging on 
the wall"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["people speak as gunfire rings out", "dogs bark as an engine runs and a person whistles"], "sample_ids": ["wqTCwqVRDlk", "zY3icUyMdh8"], "start_seconds": ["80", "20"], "properties": ["gunfire, ring, speak", "dog, bark, engine"], "captions_pred_video": ["of a woman shooting a gun at a target on the beach", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a man is speaking and a gun is fired", "a car is driving and dogs are barking and squealing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person is whistling a tune", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["scYRUkrFLiQ", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["a, tune, whistle", "animal, grunts, snorts"], "captions_pred_video": ["of the man wearing a bow tie and a suit jacket in front of a red door", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and a baby is crying"], "question": "which entity is not a person?", "label": 1}, {"captions": ["water rushes by", "a woman speaks and other women and a man talk with her"], "sample_ids": ["x-PeY8Yb8M4", "vbpKkWvfOu4"], "start_seconds": ["300", "560"], "properties": ["water, rushes, by", "a, woman, man"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a car is driving on a wet road ", "a woman is speaking and a man is speaking"], "question": "which entity is a group of people", "label": 
1}, {"captions": ["after a few seconds of silence, a loud bang occurs followed by a softer banging noise", "a man speaks followed by another man speaking outside"], "sample_ids": ["zkKdxzNC97Y", "viuTg1M-dqg"], "start_seconds": ["27", "30"], "properties": ["loud, bang, noise", "two men, speak, follow"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a loud bang followed by a softer banging noise?", "label": 0}, {"captions": ["a dark barks and whimpers", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sYj4hpDUZDQ", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["barks, whimpers, dark", "male, duck, laugh"], "captions_pred_video": ["a brown and white dog standing in front of a wall with its mouth open", null], "captions_pred_audio": ["a dog barks and a cat meows", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a man speaks as a car is passing by"], "sample_ids": ["vW4x7S1VfQc", "sK4u5T8hW78"], "start_seconds": ["150", "30"], "properties": ["clacking, oil, woman", "a, car, pass"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["food sizzles in a frying pan", "a man is speaking 
with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a horse runs while two women talk", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sdvI1mHAsc", "tdWhHV3X25Q"], "start_seconds": ["20", "60"], "properties": ["two women, horse, run", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["someone is snoring while sleeping", "a stream of water flows as people talk and wind blows"], "sample_ids": ["ujMt0-D-x2k", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["snore, sleep, someone", "stream, water, flow"], "captions_pred_video": ["of the dog playing with a toy on the floor", "footage is blurry and out of focus"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sEprKHm8Sj8", "uZesmtKZGSw"], "start_seconds": ["90", "250"], "properties": ["car, tires, slows", "men, talk, cars"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 
chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking and a car is revving with laughter in the background "], "question": "which car is going faster", "label": 1}, {"captions": ["a infant makes noise and is excited", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wIJK3-5y0kA", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["noise, excited, infant", "applause, audience, yells"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a motorcycle engine is revving while people are speaking"], "sample_ids": ["zgUgkpk78xU", "y8dSeubCNI"], "start_seconds": ["70", "4"], "properties": ["horn, bell, train", "engine revving, people speaking, motorcycle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "an engine revving and people talking in the background"], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "wind blows as people chatter quietly"], 
"sample_ids": ["tiDFTC-5vU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a frog vocalizes as birds chirp", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["wqUmIEzuNz4", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["frog, bird, vocalize", "a woman, a television program, a bird"], "captions_pred_video": ["a frog sitting in the grass on a sunny day", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a cat meows and rustles", "a woman is speaking and a dog is whimpering"], "question": "which entity is a frog", "label": 0}, {"captions": ["someone snores nearby", "a child speaks in closed space"], "sample_ids": ["spJCm8tD9Zo", "yW6FWLSLkx4"], "start_seconds": ["90", "40"], "properties": ["someone snores, nearby, someone", "child, space, speak"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "a toilet flushes and water drains"], "sample_ids": ["wvKpEYswXO0", "sfAvvZwdLCY"], "start_seconds": ["150", "20"], "properties": ["plastic, tap, speak", "water drains, flushes, water"], "captions_pred_video": ["of the person preparing food in the kitchen", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a toilet is flushed"], 
"question": "which entity is a source of water", "label": 1}, {"captions": ["a vehicle accelerates and squeals tires", "an electric engine works nearby followed by a child talking"], "sample_ids": ["yRx9txMcBl0", "xSKJGCItUWE"], "start_seconds": ["40", "10"], "properties": ["accelerates, tires, squeals", "engine, work, child"], "captions_pred_video": ["in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta", "footage of the helicopter flying in the room"], "captions_pred_audio": ["a car is revving its engine and skidding ", "a high pitched engine is running and a child speaks"], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["white noise and snoring with some rustling in the background", "people cheer as a vehicle engine revs"], "sample_ids": ["xzKKf9bKNUo", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["background, noise, snoring", "engine revs, vehicle, people"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a person snoring loudly", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["females talk and laugh over gusting wind", "paper is crumpling consistently"], "sample_ids": ["un9VQlzgZM", "v5cSxLaHADY"], "start_seconds": ["5", "0"], "properties": ["females, talk, laugh", "paper is crumpling, paper 
is white, paper is crumpling"], "captions_pred_video": [null, "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a woman is speaking and laughing with wind noise and breathing in the background ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a person is whistling", "a stream of water runs briefly"], "sample_ids": ["sIXTftIuUgw", "x-PeY8Yb8M4"], "start_seconds": ["90", "300"], "properties": ["person, whistling, person", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a person whistling a song", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks while water drains", "vehicles pass by on a roadway"], "sample_ids": ["vSeGhaZt-aI", "tgbONvsP47Y"], "start_seconds": ["50", "0"], "properties": ["water, drain, man", "pass, vehicle, roadway"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["water splashes and wind noise is made into a microphone", "a propeller rotates loudly and intensely"], "sample_ids": ["sDSppXIlJrs", "ugHJF0hfYkg"], "start_seconds": ["27", "10"], "properties": ["microphone, water, wind", "loud, intense, propeller"], "captions_pred_video": ["a man is paddling a small wooden boat in the water", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["the wind is blowing and water is splashing", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "water flows as men speak and yell"], "sample_ids": ["uWAAAL4CIoc", "vJ7JPEFhyLA"], "start_seconds": ["0", "16"], 
"properties": ["a woman, chirps, animal", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["loud intermittent buzzing with intermittent laughter", "water pouring and bubbling"], "sample_ids": ["sLUnaPT5gM8", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["loud, laughter, intermittent", "water, bubbles, pouring"], "captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "water is running from a faucet"], "question": "which entity is more quiet", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "winds blows roughly as a vehicle races past"], "sample_ids": ["vZAw4apG0Es", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["background, clock, ticktocks", "wind, blows, vehicle"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a dhl plane landing on the runway"], 
"captions_pred_audio": ["a clock is ticking and people are talking", "a jet engine roars and wind blows "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a helicopter engine idles continuously"], "sample_ids": ["su6FAOcOA8c", "ugHJF0hfYkg"], "start_seconds": ["4", "10"], "properties": ["engine, run, woman", "engine, idle, continuously"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a helicopter is flying overhead "], "question": "which entity has a running engine", "label": 0}, {"captions": ["water is sprayed across a hard surface", "an aircraft engine runs"], "sample_ids": ["sQwlkXjQabo", "yLCORCnd35Q"], "start_seconds": ["10", "0"], "properties": ["water, spray, surface", "engine, aircraft, runs"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a lufthansa airbus a380 landing at london's heathrow airport"], "captions_pred_audio": ["spraying followed by silence", "a train is moving and its wheels are squealing "], "question": "which entity is moving", "label": 1}, {"captions": ["several insects fly while two men talk", "pigeons vocalize and birds chirp"], "sample_ids": ["s-T9OVOiMLo", "uiS58TNyUiw"], "start_seconds": ["330", "430"], "properties": ["several, fly, men", "vocalize, bird, chirp"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a man speaks, then dials a rotary telephone", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tK4VlLsNxak", 
"zj2R0XoFr5k"], "start_seconds": ["120", "50"], "properties": ["a, dial, telephone", "airplane, boy, fly"], "captions_pred_video": ["person is wearing a headset and holding a remote control in his hand", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and using a sewing machine", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a boy speaking?", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["w5W5Kqtc8E", "zj2R0XoFr5k"], "start_seconds": ["100", "50"], "properties": ["water, splashes, motorboat", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tQWGZLItBXk", "zj2R0XoFr5k"], "start_seconds": ["170", "50"], "properties": ["music, person, ding", "airplane, boy, fly"], "captions_pred_video": ["worms revolution screenshots", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["small dogs yip and bark sharply", "a motor slows to a stopover traffic noises"], "sample_ids": ["v-wcQf4BDY0", "zofjfKhqLk8"], "start_seconds": ["120", "10"], "properties": ["bark, yip, sharply", "noise, stop, 
motor"], "captions_pred_video": ["footage is blurry and shaky, making it difficult to see what is happening", "footage of a man using a machine to cut a piece of wood"], "captions_pred_audio": ["a dog barks and growls", "a large engine is running and a bell is ringing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a train horn blows as it passes by", "a man speaks followed by another man speaking outside"], "sample_ids": ["zVacuqSb4LI", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["horn, blows, train", "two men, speak, follow"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a single person speaking?", "label": 0}, {"captions": ["women speak and laugh as wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["un9VQlzgZM", "yajyRTUQk3U"], "start_seconds": ["5", "400"], "properties": ["wind, speak, laugh", "a woman, something, fried"], "captions_pred_video": [null, "- a woman cooking in the kitchen"], "captions_pred_audio": ["a 
woman is speaking and laughing with wind noise and breathing in the background ", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["an small aircraft engine runs and a boy speaks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["xSKJGCItUWE", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["engine, run, boy", "wind, blow, vehicle"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a boy speaking?", "label": 0}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "someone is typing on a computer keyboard"], "sample_ids": ["sofxkNWaP0s", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["wind, engine, louder", "keyboard, type, computer"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a person is typing on a keyboard"], "question": "which is a type of computer", "label": 1}, {"captions": ["a man speaks on a radio as wind blows", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["tDVADusiIoc", "sLUnaPT5gM8"], "start_seconds": ["60", "0"], "properties": ["man, radio, blows", "loud, laughter, intermittent"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["wind blows 
and a stream of water flows nearby", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sYITalLZjj4", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["stream, flow, wind", "male, duck, laugh"], "captions_pred_video": ["two ducks are swimming in the water near each other", null], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["vh30P49Po6s", "ukg5L09Wpvo"], "start_seconds": ["30", "150"], "properties": ["loud, continuous, quacks", "clickety-clack, train, whistle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a duck is quacking loudly", "a train blows its whistle and blows its horn "], "question": "which entity is quieter", "label": 1}, {"captions": ["a woman speaks followed by another woman whimpering and speaking", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["xOZfdgAgJ9o", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["woman, whimpering, speaking", "animal, grunts, snorts"], "captions_pred_video": ["footage of a woman talking to a man in a doctor's office", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a baby is crying"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["continuous chugging with birds chirping in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["xM4joTqDVp4", "zFjIWfSD-4"], "start_seconds": ["160", "410"], "properties": ["background, chirp, birds", "People, motor, brakes"], 
"captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running and air brakes hissing?", "label": 1}, {"captions": ["a toilet flushes and a female speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yaln9y8I7ms", "wz7N8YRy74I"], "start_seconds": ["230", "30"], "properties": ["female, flushes, toilet", "rooster, crow, background, men"], "captions_pred_video": ["footage is blurry and out of focus", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a toilet flushes and a man speaks", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster?", "label": 1}, {"captions": ["a saw finishes running as metal clings in the background", "a train horn blows as it passes by"], "sample_ids": ["zofjfKhqLk8", "zVacuqSb4LI"], "start_seconds": ["10", "30"], "properties": ["background, metal, clings", "horn, blows, train"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by 
mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["a child yells and another yells", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vMDHu7Lxcgw", "uZesmtKZGSw"], "start_seconds": ["410", "250"], "properties": ["two, yell, child", "men, talk, cars"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has more cars", "label": 1}, {"captions": ["an electric engine works nearby followed by a child talking", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["xSKJGCItUWE", "wqZ135Ssz0"], "start_seconds": ["10", "60"], "properties": ["engine, work, child", "two men, woman, birds"], "captions_pred_video": ["footage of the helicopter flying in the room", null], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["some tunes played by whistling", "some men talk among st themselves as cars speed and 
race loudly"], "sample_ids": ["u6BnG6YZqJ4", "uZesmtKZGSw"], "start_seconds": ["0", "250"], "properties": ["tune, play, whistling", "men, talk, cars"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a infant makes noise and is excited", "people applaud and hoot and chat quietly"], "sample_ids": ["wIJK3-5y0kA", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["noise, excited, infant", "people, applaud, hoot"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a frog croaks as other frogs croak in the background", "a child speaks in closed space"], "sample_ids": ["yswmmRZFItk", "yW6FWLSLkx4"], "start_seconds": ["0", "40"], "properties": ["background, frog, croak", "child, space, speak"], "captions_pred_video": ["a close up of a frog in the water", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a frog is croaking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is 
speaking", "label": 1}, {"captions": ["someone is burping continuously", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["y636gklDioE", "zl9Dqx-j7q4"], "start_seconds": ["20", "6"], "properties": ["burps, burps, burps", "engine, laugh, loud"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a person burps loudly several times", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a helicopter engine idles continuously", "waves crash against a shoreline and people speak"], "sample_ids": ["ugHJF0hfYkg", "yFB25fqfU8I"], "start_seconds": ["10", "300"], "properties": ["engine, idle, continuously", "wave, crash, shoreline"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a helicopter is flying overhead ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["several ducks are quacking and squawking", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wfHeoPDLMaM", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["quacking, squawking, ducks", "female, spraying, scream"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the 
middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["ducks are quacking", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["u--KhUW8l1Y", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["horn, siren, life", "applause, audience, yells"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a person snores loudly multiple times at a close distance"], "sample_ids": ["vBHyYJ8pL0", "sSMl2vc3ek"], "start_seconds": ["2", "20"], "properties": ["noise, door, opening", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["mechanisms are ticking and a sliding door is opening and closing ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a heavy rain falls endlessly"], "sample_ids": ["vGj1XLJvNrw", "wP8ZKrlx3oA"], "start_seconds": ["0", "40"], "properties": ["wails, wails, pass", "heavy, rain, fall"], "captions_pred_video": ["footage of a police car driving down a city street", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["an emergency vehicle with a siren is 
passing by", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "pigeons vocalize and birds chirp"], "sample_ids": ["tOSWIURC-4", "uiS58TNyUiw"], "start_seconds": ["0", "430"], "properties": ["noise, engine, revs", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["some men talk among st themselves as cars speed and race loudly", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uZesmtKZGSw", "xBxDz0CFVn0"], "start_seconds": ["250", "30"], "properties": ["men, talk, cars", "stream, water, flow"], "captions_pred_video": ["1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman speaks as she rubs two objects together", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vzxHnu-SFEw", "vbZ-0lGPneg"], "start_seconds": ["80", "30"], "properties": ["two objects, woman, speak", "a woman, a television program, a bird"], "captions_pred_video": ["how to make a 
paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a woman is speaking and a dog is whimpering"], "question": "which woman is speaking", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a stream of water runs briefly"], "sample_ids": ["sZPuqDgX2V0", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["commentator, race, track", "stream, water, run"], "captions_pred_video": [null, "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "a child speaks in closed space"], "sample_ids": ["uYT5gxnyMWM", "yW6FWLSLkx4"], "start_seconds": ["50", "40"], "properties": ["person, spray, yell", "child, space, speak"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of the doll sitting on the bed with her 
hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["wAAkbZToh8", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["burp, laugh, speak", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man burps and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["the wind blows while a vehicle engine runs", "a clock ticktocks"], "sample_ids": ["xyL9F5VrjkE", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["wind, blows, vehicle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an animal quacks rapidly", "water is sprayed across a hard surface"], "sample_ids": ["vh30P49Po6s", "sQwlkXjQabo"], "start_seconds": ["30", "10"], "properties": ["animal, quacks, rapidly", "water, spray, surface"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a duck is quacking loudly", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "pigeons vocalize and birds chirp"], "sample_ids": ["y2bVZ7rz-5M", "uiS58TNyUiw"], "start_seconds": ["280", "430"], "properties": 
["engine, horn, siren", "vocalize, bird, chirp"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "of the pigeon in the cage"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a vehicle?", "label": 1}, {"captions": ["a man speaks over intermittent keyboard taps", "paper is crumpling consistently"], "sample_ids": ["tw76HGONaKg", "v5cSxLaHADY"], "start_seconds": ["570", "0"], "properties": ["audio, man, keyboard", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "paper is crumpled and crinkled"], "question": "which entity is a video", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["uEU-Hg5MTN8", "xfaoyyzw2WU"], "start_seconds": ["27", "180"], "properties": ["animal, grunts, snorts", "loud, jet engine, roar"], "captions_pred_video": ["of a girl wearing a 
pig mask and a boy hugging her", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and women speak as livestock vocalizes", "a baby laughs giddily and a woman laughs then speaks"], "sample_ids": ["vXlk0lIQBFo", "wjsXBsc7M40"], "start_seconds": ["470", "10"], "properties": ["wind, speak, vocalize", "a baby laughs, a woman laughs, a woman speaks"], "captions_pred_video": ["- a woman and two donkeys in a fenced in area", "footage of the baby playing with a toothbrush"], "captions_pred_audio": ["wind chimes are ringing and people are speaking and laughing ", "a baby laughs and a woman speaks"], "question": "which entity is about a baby and a woman?", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a car speeding up in the distance"], "sample_ids": ["vf9xf3vMsGM", "u0TrcHhkPQ"], "start_seconds": ["540", "20"], "properties": ["A man speaks while turning a water faucet on.", "distance, car, speed"], "captions_pred_video": ["of the person washing their hands under the faucet", null], "captions_pred_audio": ["a man is speaking while water is running in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["water flows followed by women screaming", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["w5W5Kqtc8E", "tDVADusiIoc"], "start_seconds": ["100", "60"], "properties": ["water, flow, women", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows water flowing?", "label": 0}, {"captions": ["an emergency 
siren wails as it passes", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vGj1XLJvNrw", "yDoT73BWsdA"], "start_seconds": ["0", "10"], "properties": ["wails, wails, pass", "engine, revs, vehicle"], "captions_pred_video": ["footage of a police car driving down a city street", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["food fries in a pan as someone talks and cooks", "a speedboat passes quickly on the water"], "sample_ids": ["ukxt9I7eMMg", "tjmoSi330GM"], "start_seconds": ["30", "23"], "properties": ["food, pan, cook", "speed, water, boat"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a person riding a jet ski on a lake with trees in the background"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a motorboat speeds through water with wind noise "], "question": "which is moving faster", "label": 0}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["wqZ135Ssz0", "y2bVZ7rz-5M"], "start_seconds": ["60", "280"], "properties": ["man, woman, squawks", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is accompanied by a horn", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["w9lpbUn0hPc", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["male, wind, rustling", 
"beeps, hit, woman"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["continuous snoring", "waves crash against a shoreline and people speak"], "sample_ids": ["sLkeqCDJIyw", "yFB25fqfU8I"], "start_seconds": ["120", "300"], "properties": ["loud, snoring, noise", "wave, crash, shoreline"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is louder", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "insects humming with a dog barking and small goat bleating"], "sample_ids": ["uJV8NDaHqqk", "tIY7qOV3rEM"], "start_seconds": ["100", "0"], "properties": ["loud, fly, chirp", "animal, bark, dog, barking, small, goat, bleating"], "captions_pred_video": ["a bee hive in a wooden box", "a dog is standing in the middle of a dirt road in the woods"], "captions_pred_audio": ["a swarm of bees buzzing around", "a dog is barking and a cat is meowing"], "question": "which animal is barking", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "a man speaks over intermittent keyboard taps"], "sample_ids": ["sYITalLZjj4", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["water, rushes, background, birds", "audio, man, keyboard"], "captions_pred_video": ["two ducks are swimming in the water near each other", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": 
["wind blows and birds chirp", "a man speaks and types on a computer keyboard "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "wind blows as people chatter quietly"], "sample_ids": ["s4Uz1Ffgo04", "xBxDz0CFVn0"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "wind, chatter, people"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["food is frying then a woman speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["ukxt9I7eMMg", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["food, woman, speak", "men, talk, cars"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["an insect buzzes around continuously", "people speak as gunfire rings out"], "sample_ids": ["v25l1jef3JY", "wqTCwqVRDlk"], 
"start_seconds": ["0", "80"], "properties": ["buzzes, continuously, insect", "gunfire, ring, speak"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["waves of water rumble", "a car accelerates and wind blows"], "sample_ids": ["vwqaIHKxLvM", "u0TrcHhkPQ"], "start_seconds": ["20", "20"], "properties": ["sound, wave, water", "accelerates, wind, blows"], "captions_pred_video": ["of a surfer riding a big wave in the ocean", null], "captions_pred_audio": ["waves crash and wind blows ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["an engine runs and wind blows", "a child speaks in closed space"], "sample_ids": ["vs65y4qmyBE", "yW6FWLSLkx4"], "start_seconds": ["340", "40"], "properties": ["engine, run, wind", "child, space, speak"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "water splashes as an animal walks through"], "sample_ids": ["x9JovgqUcs", "w1ir-sZ3Im8"], "start_seconds": ["500", "90"], "properties": ["a, man, speaks, keyboard", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man speaks and types on a keyboard", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["birds chirp and an 
owl hoots before a man speaks briefly", "someone is typing on a computer keyboard"], "sample_ids": ["wRBHTgrbiwg", "v0x1odnXtP0"], "start_seconds": ["50", "210"], "properties": ["bird, owl, speak", "keyboard, type, computer"], "captions_pred_video": ["of a bee pollinating the flowers in the field", "how to make money on youtube in spanish"], "captions_pred_audio": ["birds are chirping and insects are buzzing", "a person is typing on a keyboard"], "question": "which entity is a person", "label": 1}, {"captions": ["a person is burping then speaks and laughs", "someone is typing on a computer keyboard"], "sample_ids": ["wAAkbZToh8", "v0x1odnXtP0"], "start_seconds": ["0", "210"], "properties": ["burp, laugh, speak", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man burps and a woman speaks", "a person is typing on a keyboard"], "question": "which person is typing on a computer keyboard", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "three men talk while wind blows and some liquid flows"], "sample_ids": ["vz8868znkVQ", "vJ7JPEFhyLA"], "start_seconds": ["60", "16"], "properties": ["audio, click, kid speaking", "three men, wind, flow"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video", "label": 1}, {"captions": ["water splashes as an animal walks through", "water pouring and bubbling"], "sample_ids": ["w1ir-sZ3Im8", "uyRfq-jKPpo"], "start_seconds": ["90", "50"], "properties": ["animal, water, splashes", "water, bubbles, pouring"], "captions_pred_video": ["footage of a group of people riding horses through a river", "on youtube how to do an afro puff hairstyle youtube how to do 
an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["water splashes and gurgles as people speak", "water is running from a faucet"], "question": "which entity is more calm", "label": 1}, {"captions": ["an airplane engine runs", "a person screams glaringly"], "sample_ids": ["yVPZ2MNWpms", "xC8kbrKJmco"], "start_seconds": ["0", "0"], "properties": ["engine, airplane, runs", "glaringly, screams, person"], "captions_pred_video": ["footage of an airport with planes parked on the tarmac", null], "captions_pred_audio": ["a car is driving by on the road ", "a goat is bleating "], "question": "which entity is louder", "label": 1}, {"captions": ["a man laughs and speaks as cats purr and hiss", "a propeller rotates loudly and intensely"], "sample_ids": ["vVhthZ45k3Y", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["cat, purr, hiss", "loud, intense, propeller"], "captions_pred_video": ["footage is blurry and out of focus", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sjlVMgdGSK0", "tdWhHV3X25Q"], 
"start_seconds": ["30", "60"], "properties": ["accelerates, vehicle, race car", "applause, audience, yells"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "people applaud and hoot and chat quietly"], "sample_ids": ["uEU-Hg5MTN8", "wwyfGO2J4"], "start_seconds": ["27", "90"], "properties": ["animal, grunts, snorts", "people, applaud, hoot"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as horns blow", "dog barking and vehicle engine idling followed shortly by vehicle engine revving"], "sample_ids": ["tHyNqRyK34A", "zY3icUyMdh8"], "start_seconds": ["24", "20"], "properties": ["a, man, speaks", "dog, bark, engine"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a bus driving through a residential street at night"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a car is driving and dogs are barking and squealing "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an engine starts and increases in power", "a woman speaks and other women and a man talk with her"], "sample_ids": ["zjTG0gaGCUI", "vbpKkWvfOu4"], "start_seconds": ["80", "560"], "properties": ["power, increase, engine", "a, woman, man"], "captions_pred_video": [null, "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a jet engine roars as wind blows ", "a woman is speaking and a man is speaking"], "question": "which entity is a group of people", "label": 1}, {"captions": ["a child babbles as a woman speaks", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["wEBlkGWVWwE", "uZesmtKZGSw"], "start_seconds": ["260", "250"], "properties": ["a, babble, woman", "men, talk, cars"], "captions_pred_video": ["shows a person writing on the whiteboard", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a man speaks as water trickles down a stream", "a duck quacks continuously"], "sample_ids": ["sapQIQUhFc", "vh30P49Po6s"], "start_seconds": ["280", "30"], "properties": ["water, stream, trickles", "quacks, continuously, duck"], "captions_pred_video": [null, "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "someone whistles a tune"], "sample_ids": 
["vh30P49Po6s", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["loud, continuous, quacks", "someone, tune, whistle"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a person whistling a song"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a small musical boom and then birds tweet and a few dogs pant"], "sample_ids": ["sK4u5T8hW78", "y2ZBGpgbhHM"], "start_seconds": ["30", "30"], "properties": ["a, car, pass", "birds, tweet, pant"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "birds chirping and a dog panting"], "question": "which entity has more animals", "label": 1}, {"captions": ["male speech followed by light wind, rustling, distant speech and brief hissing", "frogs croak and vocalize"], "sample_ids": ["w9lpbUn0hPc", "yswmmRZFItk"], "start_seconds": ["30", "0"], "properties": ["male, wind, rustling", "croak, vocalize, frog"], "captions_pred_video": ["footage of a man in a black shirt standing in front of a white truck in a parking lot", "a close up of a frog in the water"], "captions_pred_audio": ["a man is speaking with wind noise and breathing sounds in the background ", "a frog is croaking"], "question": "which entity is a frog?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "multiple people speak and children yell while water gurgles"], "sample_ids": ["uJV8NDaHqqk", 
"vb1fPSDI4c"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "multiple, people, yell"], "captions_pred_video": ["a bee hive in a wooden box", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as horns blow", "vehicles pass by on a roadway"], "sample_ids": ["tHyNqRyK34A", "tgbONvsP47Y"], "start_seconds": ["24", "0"], "properties": ["a, man, speaks", "pass, vehicle, roadway"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "a car is driving on the road "], "question": "which entity is more active", "label": 1}, {"captions": ["heavy rain splashes as it falls", "a motor idles, accelerates, then slows down."], "sample_ids": ["wP8ZKrlx3oA", "vYkA3cfXp5Q"], "start_seconds": ["40", "30"], "properties": ["fall, rain, splash", "speed, idle, accelerate"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a heavy rain is falling on a surface", "an engine is idling"], "question": "which entity is not a motor?", "label": 0}, {"captions": ["a horn honks twice and keys jingle, followed by a slam and an electronic beep", "a vehicle engine accelerating then running on idle"], "sample_ids": ["wSVhSdj0F0", "vYkA3cfXp5Q"], "start_seconds": ["10", "30"], "properties": ["horn honks, keys jingle, slam", "engine, accelerate, idle"], "captions_pred_video": [null, "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person burps 
loudly for a long time nearby", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vf44CgrjT0A", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["loud, long, person", "female, spraying, scream"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a loud burp", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["a dog barks and whimpers", "an airplane flies overhead as a woman speaks"], "sample_ids": ["sShpyu2l4YQ", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["barks, whimpers, dog", "airplane, fly, overhead"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a dog is barking and growling", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying overhead", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "people applaud and hoot and chat quietly"], "sample_ids": ["tEE3MpBt1sg", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["drill, something, laugh", "people, applaud, hoot"], "captions_pred_video": ["footage is blurry due to the smoke in the air", null], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "people are clapping and speaking with background noise "], "question": "which entity is more likely to be a performance", "label": 1}, {"captions": ["an insect buzzes around continuously", "an engine runs loudly"], "sample_ids": ["v25l1jef3JY", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["buzzes, continuously, insect", "loud, engine, run"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "footage is blurry 
because it's raining outside"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["birds chirp as a bell rings", "a car accelerates and wind blows"], "sample_ids": ["ziUT9IFTkjg", "u0TrcHhkPQ"], "start_seconds": ["10", "20"], "properties": ["chirp, bell, ring", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a frog vocalizes while birds chirp", "a car accelerates and wind blows"], "sample_ids": ["vMf1dLD6Sng", "u0TrcHhkPQ"], "start_seconds": ["6", "20"], "properties": ["frog, bird, vocalize", "accelerates, wind, blows"], "captions_pred_video": ["a frog in a pond with pink flowers in the background", null], "captions_pred_audio": ["a frog croaks loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a drill drills through something then people begin laughing", "a woman speaks as she rubs two objects together"], "sample_ids": ["tEE3MpBt1sg", "vzxHnu-SFEw"], "start_seconds": ["50", "80"], "properties": ["drill, something, laugh", "two objects, woman, speak"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube 
how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a demonstration of a tool?", "label": 0}, {"captions": ["a man speaks while a rooster crows and other people speak in the background", "a man speaks as a car is passing by"], "sample_ids": ["wz7N8YRy74I", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, people", "a, car, pass"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has a rooster in it?", "label": 0}, {"captions": ["birds vocalize and chirp continuously", "three men talk while wind blows and some liquid flows"], "sample_ids": ["w1mlz3Pe4fU", "vJ7JPEFhyLA"], "start_seconds": ["300", "16"], "properties": ["vocalize, chirp, continuously", "three men, wind, flow"], "captions_pred_video": ["of a bird in a cage", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["birds are chirping and 
singing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a bird?", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "wind blowing followed by a zoom"], "sample_ids": ["sYITalLZjj4", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["water, rushes, background, birds", "wind, blow, zoom"], "captions_pred_video": ["two ducks are swimming in the water near each other", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["wind blows and birds chirp", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more calm", "label": 0}, {"captions": ["children cry and people talk", "a person snores loudly multiple times at a close distance"], "sample_ids": ["xLwHe825Zs", "sSMl2vc3ek"], "start_seconds": ["18", "20"], "properties": ["people talk, children cry, people talk", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby cries and a woman speaks", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a baby cries and a woman moans", "birds chirp, a woman speaks, and insects buzz"], "sample_ids": ["smDKStoHBJo", "t97k0cejSQE"], "start_seconds": ["0", "250"], "properties": ["a, cry, woman", "sound, chirp, buzz"], "captions_pred_video": ["a man holding a crying baby in his arms", "a bee on a purple thistle flower"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a bee buzzes and a woman speaks"], "question": "which entity has a higher pitch", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["t69a8aRKhmc", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["a, b, c", "background, birds, rustling"], "captions_pred_video": ["footage is blurry and out of focus", null], 
"captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "birds are chirping and a chime is ringing "], "question": "which entity has a bird in the background?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["vZAw4apG0Es", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["background, clock, ticktocks", "beeps, hit, woman"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a clock is ticking and people are talking", "a beep sounds followed by a child speaking"], "question": "which entity has a clock ticking in the background?", "label": 0}, {"captions": ["a man speaks then multiple motorcycles pass by", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["zcDwZ6W7E3E", "ukg5L09Wpvo"], "start_seconds": ["180", "150"], "properties": ["a, man, speak", "clickety-clack, train, whistle"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["a motor vehicle roars, drowning 
out people speaking in the background", "a infant makes noise and is excited"], "sample_ids": ["s4Uz1Ffgo04", "wIJK3-5y0kA"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "noise, excited, infant"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a baby cries and a woman speaks"], "question": "which entity is quieter", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "water splashes as an animal walks through"], "sample_ids": ["zl9Dqx-j7q4", "w1ir-sZ3Im8"], "start_seconds": ["6", "90"], "properties": ["engine, laugh, loud", "animal, water, splashes"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a jet engine roars ", "water splashes and gurgles as people speak"], "question": "which entity is more quiet", "label": 1}, {"captions": ["birds twitter and chirp and clatter", "a clock ticktocks"], "sample_ids": ["yeFvk9x0wWI", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["chirp, twitter, clatter", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a chime of a clock followed by various tones of ticking with come clinking", "a man speaks, another man speaks, and a small bell dings"], "sample_ids": ["uqFtmnhuqA8", "t69a8aRKhmc"], "start_seconds": ["30", "30"], "properties": ["a, b, c", "a, b, c"], "captions_pred_video": ["shows a clock on the wall of a room with a person standing in front of it", "footage is 
blurry and out of focus"], "captions_pred_audio": ["mechanisms are ticking and a hammer is striking ", "a man is speaking and birds are chirping in the background "], "question": "which entity has a ding?", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "small dogs yip and bark sharply"], "sample_ids": ["vqZuVbG6-HI", "v-wcQf4BDY0"], "start_seconds": ["130", "120"], "properties": ["background, male, female", "bark, yip, sharply"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a dog barks and growls"], "question": "which entity is more aggressive", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "an insect buzzes around continuously"], "sample_ids": ["vms5XGTDVQc", "v25l1jef3JY"], "start_seconds": ["220", "0"], "properties": ["paper, crumpled, crinkled", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["paper is crumpled and crinkled", "a fly is buzzing around a microphone "], "question": "which entity is a living thing", "label": 1}, {"captions": ["a motorcycle engine works nearby", "several insects fly while two men talk"], "sample_ids": ["tOSWIURC-4", "s-T9OVOiMLo"], "start_seconds": ["0", "330"], "properties": ["engine, work, nearby", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a lawn mower is running ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a living thing", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "vehicles pass by on a roadway"], 
"sample_ids": ["y2bVZ7rz-5M", "tgbONvsP47Y"], "start_seconds": ["280", "0"], "properties": ["motor noise, horn, siren", "pass, vehicle, roadway"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "some tunes played by whistling"], "sample_ids": ["tQWGZLItBXk", "u6BnG6YZqJ4"], "start_seconds": ["170", "0"], "properties": ["voice, music, whoosh", "tune, play, whistling"], "captions_pred_video": ["worms revolution screenshots", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a vehicle engine runs and wind blows before women yell", "waves crash against a shoreline and people speak"], "sample_ids": ["w5W5Kqtc8E", "yFB25fqfU8I"], "start_seconds": ["100", "300"], "properties": ["wind, blow, vehicle", "wave, crash, shoreline"], "captions_pred_video": [null, "footage of a person surfing in the ocean"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more likely to be in a storm", "label": 1}, {"captions": ["a man speaks as a car is passing by", "someone is typing on a computer keyboard"], "sample_ids": ["sK4u5T8hW78", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["a, car, pass", "keyboard, type, computer"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 
2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["birds chirp as a bell rings", "music plays and a woman speaks on a radio before gunshots are fired"], "sample_ids": ["ziUT9IFTkjg", "xKB8O8LTs6s"], "start_seconds": ["10", "70"], "properties": ["chirp, bell, ring", "music, radio, gunshots"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["vBHyYJ8pL0", "uZesmtKZGSw"], "start_seconds": ["2", "250"], "properties": ["noise, door, opening", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["mechanisms are ticking and a 
sliding door is opening and closing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is about cars?", "label": 1}, {"captions": ["a man speaks while water trickles and flows", "people cheer as a vehicle engine revs"], "sample_ids": ["sapQIQUhFc", "xjhAnI2q6hM"], "start_seconds": ["280", "6"], "properties": ["water, trickles, flow", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["an infant crying and a woman speaking with some distant murmuring", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["smDKStoHBJo", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["a, infant, speaking", "two men, woman, birds"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a male speaks over some small clicks", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["uXxVebHsGZ8", "ziUT9IFTkjg"], "start_seconds": ["30", "10"], "properties": ["male, clicks, speak", "background, birds, rustling"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man speaks and types on a computer keyboard", "birds are chirping and a chime is ringing "], "question": "which entity has a background of birds chirping", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "a toilet flushes and water drains"], "sample_ids": ["yNtRmrn0io8", "sfAvvZwdLCY"], 
"start_seconds": ["210", "20"], "properties": ["storm, distance, strike", "water drains, flushes, water"], "captions_pred_video": ["footage of a house in the middle of the night", "footage of the toilet in the bathroom"], "captions_pred_audio": ["rain falls and thunder roars", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["an aircraft engine runs", "a horn rings out as a machine runs by"], "sample_ids": ["yLCORCnd35Q", "slZLHwNbbt4"], "start_seconds": ["0", "300"], "properties": ["engine, aircraft, runs", "a, horn, run"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a machine?", "label": 1}, {"captions": ["a duck quacks continuously", "vehicles pass by on a roadway"], "sample_ids": ["vh30P49Po6s", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["quacks, continuously, duck", "pass, vehicle, roadway"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a duck is quacking loudly", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a person snores loudly multiple times at a close distance"], "sample_ids": ["yYJksgsxx5U", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["audio, woman, silverware", "loud, multiple, distance"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", null], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a person snores 
hilariously while someone laughs", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sSMl2vc3ek", "uEU-Hg5MTN8"], "start_seconds": ["20", "27"], "properties": ["a person, laughs, snores", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and a baby is crying"], "question": "which entity is a person?", "label": 0}, {"captions": ["pigeons vocalize and birds chirp", "a person snores loudly multiple times at a close distance"], "sample_ids": ["uiS58TNyUiw", "sSMl2vc3ek"], "start_seconds": ["430", "20"], "properties": ["vocalize, bird, chirp", "loud, multiple, distance"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["rwTERCUno", "su6FAOcOA8c"], "start_seconds": ["90", "4"], "properties": ["engine, idle, sputter", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["an engine is idling and vibrating", "a woman is speaking and a subway train is moving "], "question": "which entity is a bus?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "race cars go around a track as a man commentates"], "sample_ids": ["wz7N8YRy74I", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["rooster, crow, background, men", "car, track, man"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden 
monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["a man speaks in the background while a slow tick repeats", "vehicles pass by on a roadway"], "sample_ids": ["vZAw4apG0Es", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["background, tick, repeat", "pass, vehicle, roadway"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a clock is ticking and people are talking", "a car is driving on the road "], "question": "which entity has a lot of vehicles passing by on a roadway?", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "people speak as gunfire rings out"], "sample_ids": ["t8CV69hcvF0", "wqTCwqVRDlk"], "start_seconds": ["210", "80"], "properties": ["person, sneeze, follow", "gunfire, ring, speak"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman sneezes and speaks", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "an infant crying as a woman laughs"], "sample_ids": ["zcDwZ6W7E3E", "xhmRY9yhC7c"], "start_seconds": ["180", "20"], "properties": ["man, speak, motorcycles", "a, laugh, infant"], 
"captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a siren comes to life as a horn blares"], "sample_ids": ["y2bVZ7rz-5M", "u--KhUW8l1Y"], "start_seconds": ["280", "0"], "properties": ["motor noise, horn, siren", "horn, siren, life"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a firefighter spraying water from a fire hydrant at night"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a fire truck siren blares and a horn blows "], "question": "which entity has a horn that is blaring?", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "water is sprayed across a hard surface"], "sample_ids": ["uEU-Hg5MTN8", "sQwlkXjQabo"], "start_seconds": ["27", "10"], "properties": ["a woman, laughs, animal", "water, spray, surface"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "an airplane engine runs"], "sample_ids": ["soTOh3zYJfY", "yVPZ2MNWpms"], "start_seconds": ["40", "0"], "properties": ["vehicle, skid, tires", "engine, airplane, runs"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a car is driving by on the road "], "question": "which 
entity is a vehicle", "label": 0}, {"captions": ["some liquid flows while a woman laughs and man talks", "a race car approaches quickly and slows down squealing tires"], "sample_ids": ["vddP56-ogds", "sEprKHm8Sj8"], "start_seconds": ["30", "90"], "properties": ["liquid, laughs, man", "car, tires, slows"], "captions_pred_video": [null, "rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a clock ticktocks and sounds an alarm then a man laughs", "a male speaks and another male speaks"], "sample_ids": ["xV7Mg1QucSc", "viuTg1M-dqg"], "start_seconds": ["14", "30"], "properties": ["alarm, ticktocks, laughs", "two males, speaking, male"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more males speaking", "label": 1}, {"captions": ["an animal growls followed by birds chirping", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y2ZBGpgbhHM", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["animal, growl, bird", "men, talk, cars"], "captions_pred_video": [null, "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet 
holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["birds chirping and a dog panting", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more active", "label": 1}, {"captions": ["food is frying while a woman speaks", "someone snores nearby"], "sample_ids": ["yhQ2Lg-7qDY", "spJCm8tD9Zo"], "start_seconds": ["130", "90"], "properties": ["food, woman, speak", "someone snores, nearby, someone"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a person is snoring loudly"], "question": "what is a person doing in the first picture?", "label": 0}, {"captions": ["a child and woman laughs and the woman speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["uPDn2BFTHk", "zj2R0XoFr5k"], "start_seconds": ["140", "50"], "properties": ["woman, laughs, speaks", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman speaks while a helicopter flies overhead "], "question": "which entity shows a child and woman laughs and the woman speaks?", "label": 0}, {"captions": ["a girl talking, laughing and sneezing noise", "an engine runs loudly"], "sample_ids": ["y4tPJXBKDig", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["a, noise, talk", "loud, engine, run"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", 
"a lawn mower is running and men are speaking "], "question": "which noise is louder", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a woman speaks and is crumpling paper"], "sample_ids": ["uEU-Hg5MTN8", "xvDdE3zNf8Y"], "start_seconds": ["27", "120"], "properties": ["a woman, laughs, animal", "A, crumple, paper"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of a woman in a white shirt and glasses holding a purple tie"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman speaks and crumples paper"], "question": "which woman is crumpling paper", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a vehicle engine runs and someone speaks"], "sample_ids": ["xKB8O8LTs6s", "zF8yoL0rkbI"], "start_seconds": ["70", "30"], "properties": ["music, radio, gunshots", "engine, run, someone"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of the traffic on the street at night"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "the wind is blowing hard and water is splashing"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a woman speaks with water running", "propeller rearing loudly with some male and female voices interspersed in the background"], "sample_ids": ["wTideSjRFS0", "vqZuVbG6-HI"], "start_seconds": ["30", "130"], "properties": ["water, running, woman", "background, male, female"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a lawn mower is running and men are speaking "], "question": "which entity has a woman speaking with water running in the background?", "label": 0}, {"captions": ["a train horn blares as a train passes, then fades", 
"people speak as gunfire rings out"], "sample_ids": ["zVacuqSb4LI", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["blares, fades, train", "gunfire, ring, speak"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["dogs barking and whimpering", "someone whistles a tune"], "sample_ids": ["tIY7qOV3rEM", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["barking, whimpering, dog", "someone, tune, whistle"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", null], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a person whistling a song"], "question": "which entity is a human", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "birds chirp and objects are moved around"], "sample_ids": ["vBslzh7saPw", "yPUYU6t3rwo"], "start_seconds": ["90", "370"], "properties": ["power, scream, increase", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a pickup truck carrying a large 
object down the road", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a jet engine roars and accelerates ", "insects buzz and a man speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "motors runs briefly and tires screech"], "sample_ids": ["sZvwOuuPGP0", "yRx9txMcBl0"], "start_seconds": ["50", "40"], "properties": ["engine, diesel, truck", "motors, tires, screech"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "in 10 words or less the video is about a red mustang driving down a street at night in grand theft auto 5 gta 5, gta 5 mods, gta 5 cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta 5 mods cars, gta"], "captions_pred_audio": ["a medium engine is running ", "a car is revving its engine and skidding "], "question": "which entity has a continuous running engine", "label": 0}, {"captions": ["a clock ticktocks in wind", "a duck quacks loudly and continuously"], "sample_ids": ["yVumC9TGknc", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, wind", "loud, continuous, quacks"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a series of beeps and chirps", "a duck is quacking loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "a stream of water flows as people talk and wind blows"], 
"sample_ids": ["yZp6xizR0yU", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["animal, bleat, cry", "stream, water, flow"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "footage is blurry and out of focus"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a man is speaking with wind noise in the background "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a man talks as several small engines run", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u9A6VZQCZpU", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["a, man, talk", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking while a race car is revving and accelerating ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is about a boy flying an airplane?", "label": 1}, {"captions": ["a bird is chirping and tweeting a bird song", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["wPz6QRAkEb4", "zFjIWfSD-4"], "start_seconds": ["60", "410"], "properties": ["chirps, tweets, song", "People, motor, brakes"], "captions_pred_video": ["a bird in a cage on top of a pole", null], "captions_pred_audio": ["birds are chirping in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is not a bird?", "label": 1}, {"captions": ["pigeons vocalize and birds chirp", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uiS58TNyUiw", "zFjIWfSD-4"], "start_seconds": ["430", "410"], "properties": ["vocalize, bird, chirp", "People, motor, brakes"], "captions_pred_video": ["of the pigeon in the cage", null], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "a man is speaking while a car is driving and a 
ticking sound is heard "], "question": "which entity is not a symphony", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "an airplane engine spools and people speak"], "sample_ids": ["xKB8O8LTs6s", "wTjoRj1se3U"], "start_seconds": ["70", "390"], "properties": ["music, gunfire, explosion", "airplane, engine, spool"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a jet engine is running and people are talking"], "question": "which entity is a video of an airplane engine spooling?", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a machine beeps continuously"], "sample_ids": ["wnpJndXuxLc", "y682ml90jGw"], "start_seconds": ["50", "11"], "properties": ["blows, vehicle, train", "beeps, machine, continuously"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["speaking following by laughing and clapping", "water pouring and bubbling"], "sample_ids": ["u2f5NpsoHBg", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["person, laugh, clap", "water, bubbles, pouring"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to 
do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a baby cries and wails as an adult female speaks", "wind blows as people chatter quietly"], "sample_ids": ["zliInBdC98Y", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["a, baby, cries, wails", "wind, chatter, people"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "footage is blurry and out of focus"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a small engine spits as it runs", "paper is crumpling consistently"], "sample_ids": ["sZvwOuuPGP0", "v5cSxLaHADY"], "start_seconds": ["50", "0"], "properties": ["spits, engine, runs", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a medium engine is running ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yZmhM1HcsyE", "w5W5Kqtc8E"], "start_seconds": ["4", "100"], "properties": ["engine, roar, water", "wind, blow, vehicle"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", null], 
"captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a girl talking, laughing and sneezing noise", "a person snores loudly multiple times at a close distance"], "sample_ids": ["y4tPJXBKDig", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["a, noise, talk", "loud, multiple, distance"], "captions_pred_video": ["footage of the woman wiping her nose with a tissue", null], "captions_pred_audio": ["a woman is speaking and coughing with background noise and breathing ", "a person snoring loudly"], "question": "which noise is louder", "label": 1}, {"captions": ["a dog barks and whimpers", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["sShpyu2l4YQ", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "male, duck, laugh"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a car speeding up in the distance", "people applaud and hoot and chat quietly"], "sample_ids": ["u0TrcHhkPQ", "wwyfGO2J4"], "start_seconds": ["20", "90"], "properties": ["distance, car, speed", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "people are clapping and speaking with background noise "], "question": "which entity is moving faster", "label": 0}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "a woman speaks happily and an animal chirps"], "sample_ids": ["ul60S8TXDA8", "uWAAAL4CIoc"], "start_seconds": ["60", "0"], "properties": ["sound, distance, bell", "a woman, chirps, animal"], "captions_pred_video": ["game the legend of zelda ocarina of time on 
nintendo 64", null], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a woman is speaking and a dog is barking "], "question": "which entity is more quiet", "label": 1}, {"captions": ["long loud burping by a man", "water flows as men speak and yell"], "sample_ids": ["xmiUIOhtZyQ", "vJ7JPEFhyLA"], "start_seconds": ["60", "16"], "properties": ["loud, burp, man", "water, flow, men"], "captions_pred_video": ["homer simpson drinking a beer", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a person burps and music plays in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a man speaking and yelling?", "label": 1}, {"captions": ["a motor noise is accompanied by a door opening and closing", "a woman speaks as she rubs two objects together"], "sample_ids": ["vBHyYJ8pL0", "vzxHnu-SFEw"], "start_seconds": ["2", "80"], "properties": ["noise, door, opening", "two objects, woman, speak"], "captions_pred_video": [null, "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["mechanisms are ticking and a 
sliding door is opening and closing ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["an engine sputters followed by a car zooming by", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["u5RmF3c3Aw", "y2bVZ7rz-5M"], "start_seconds": ["60", "280"], "properties": ["engine, car, zoom", "motor noise, horn, siren"], "captions_pred_video": [null, "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a race car accelerates and skids with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a car zooming by", "label": 0}, {"captions": ["people converse in the distance as a clock ticks", "a person is whistling"], "sample_ids": ["vZAw4apG0Es", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["people, clock, converse", "person, whistling, person"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a person whistling a song"], "question": "which person is whistling", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["zofjfKhqLk8", "uYT5gxnyMWM"], "start_seconds": ["10", "50"], "properties": ["noise, stop, motor", "a, scream, girl"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and a baby is crying"], "question": "which entity has more noise", "label": 1}, {"captions": ["bird squawks are accompanied by a man and woman speaking", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": 
["wqZ135Ssz0", "vlS6YMeWAPo"], "start_seconds": ["60", "40"], "properties": ["man, woman, squawks", "sheep, baa, birds"], "captions_pred_video": [null, "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a goat bleats and birds chirp"], "question": "which entity has more animals", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vlJS7LN2XyM", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["background, clocks, ticking", "harsh, wind, blows"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with wind noise in the background "], "question": "which entity is more calm", "label": 0}, {"captions": ["men speak and a nozzle sprays liquid", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["wRV8yMk886E", "w34HjHr6gAY"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "beeps, hit, woman"], "captions_pred_video": ["two cars are parked in a parking lot at night", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a beep sounds followed by a child 
speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a man speaks as a motor runs in the background"], "sample_ids": ["xhmRY9yhC7c", "xZepNM9qcRA"], "start_seconds": ["20", "30"], "properties": ["a, laugh, infant", "background, motor, run"], "captions_pred_video": ["of a baby crying in a baby bouncer", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is a person", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a child speaks in closed space"], "sample_ids": ["sK4u5T8hW78", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["a, car, pass", "child, space, speak"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a woman speaks and taps on a hard surface before running tap water"], "sample_ids": ["sQwlkXjQabo", "wvKpEYswXO0"], "start_seconds": ["10", "150"], "properties": ["water, spray, surface", "water, tap, run"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "of the person preparing food in the kitchen"], "captions_pred_audio": ["spraying 
followed by silence", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a source of water", "label": 1}, {"captions": ["a cat meows and children speak", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["x5cuQjOdM3E", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["cat, speak, children", "animal, grunts, snorts"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a cat meows and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity is more active", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a horn blasts as warning bells ring"], "sample_ids": ["w34HjHr6gAY", "zgUgkpk78xU"], "start_seconds": ["30", "70"], "properties": ["beeps, squawk, child speaking", "horn, bells, ring"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a warning", "label": 
1}, {"captions": ["a woman speaks and laughs and an animal grunts and snorts", "a duck quacks continuously"], "sample_ids": ["uEU-Hg5MTN8", "vh30P49Po6s"], "start_seconds": ["27", "30"], "properties": ["animal, grunts, snorts", "quacks, continuously, duck"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a duck is quacking loudly"], "question": "which animal is speaking", "label": 1}, {"captions": ["a vehicle engine runs as a siren and horn sound", "an infant crying as a woman laughs"], "sample_ids": ["u--KhUW8l1Y", "xhmRY9yhC7c"], "start_seconds": ["0", "20"], "properties": ["sound, vehicle, horn", "a, laugh, infant"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a helicopter engine runs continuously"], "sample_ids": ["vhJWZheqaE", "ugHJF0hfYkg"], "start_seconds": ["0", "10"], "properties": ["water drains unevenly, toilet flushes, water drains", "engine, running, continuously"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a toilet is flushed", "a helicopter is flying overhead "], "question": "which entity is not running continuously?", "label": 0}, {"captions": ["a person is burping then speaks and laughs", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["wAAkbZToh8", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["burp, laugh, speak", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man burps and a woman speaks", "a woman 
is speaking and a baby is crying"], "question": "which entity is a person?", "label": 0}, {"captions": ["a woman speaks as frying food sizzles", "a telephone rings followed by a woman talking"], "sample_ids": ["wTideSjRFS0", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["food, sizzle, woman", "ring, talk, woman"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a dial tone sounds followed by a woman speaking"], "question": "which woman is talking", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "people speak as gunfire rings out"], "sample_ids": ["u7C-AEBQM", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["ticks, rhythmic, quiet", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["a jet engine spools up and takes off", "a man speaks while water trickles and flows"], "sample_ids": ["vBslzh7saPw", "sapQIQUhFc"], "start_seconds": ["90", "280"], "properties": ["engine, spools, takes", "water, trickles, flow"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a man is speaking and a stream is flowing in the background "], "question": "which entity is a moving object", "label": 0}, {"captions": ["animals bleat and cry out and then a woman speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yZp6xizR0yU", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["animal, bleat, cry", "rustling, ducks, quack"], "captions_pred_video": ["footage of a woman feeding goats 
in a barn", null], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 0}, {"captions": ["an airplane flies overhead as a woman speaks", "a few ducks quack and scamper and a man speaks"], "sample_ids": ["zj2R0XoFr5k", "w2bYrCVLT60"], "start_seconds": ["50", "120"], "properties": ["airplane, fly, overhead", "ducks, speak, quack"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of the ducks drinking from a pink pool in the grass"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "ducks are quacking and a man is speaking"], "question": "which entity is speaking", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["y8WEcpOlT3I", "wz7N8YRy74I"], "start_seconds": ["40", "30"], "properties": ["harsh, wind, blows", "rooster, crow, background, men"], "captions_pred_video": ["on how to use a sewing machine youtube", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a stream of water flows as people talk and wind blows"], "sample_ids": ["rqfQRErjfk8", "xBxDz0CFVn0"], "start_seconds": ["170", "30"], "properties": ["crowd, cheers, applauds", "stream, water, flow"], "captions_pred_video": ["a man hugging another man in front of an orchestra", "footage is blurry and out of focus"], "captions_pred_audio": ["a crowd of people clapping and cheering", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a clock 
ticktocks and sounds an alarm then a man laughs", "someone is typing on a computer keyboard"], "sample_ids": ["xV7Mg1QucSc", "v0x1odnXtP0"], "start_seconds": ["14", "210"], "properties": ["alarm, ticktocks, laughs", "keyboard, type, computer"], "captions_pred_video": ["a cuckoo clock hanging on the wall", "how to make money on youtube in spanish"], "captions_pred_audio": ["an alarm clock ticks and a woman laughs", "a person is typing on a keyboard"], "question": "which object is used to type on a computer", "label": 1}, {"captions": ["a vehicle accelerates squealing tires", "a horn rings out as a machine runs by"], "sample_ids": ["sd7xVssqlw", "slZLHwNbbt4"], "start_seconds": ["50", "300"], "properties": ["accelerates, tires, squealing", "a, horn, run"], "captions_pred_video": [null, "footage of a train coming down the tracks on a sunny day"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "an infant crying as a woman laughs"], "sample_ids": ["zj2R0XoFr5k", "xhmRY9yhC7c"], "start_seconds": ["50", "20"], "properties": ["airplane, boy, fly", "a, laugh, infant"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["continuous chugging with birds chirping in the background", "a door opens and closes"], "sample_ids": ["xM4joTqDVp4", "vBHyYJ8pL0"], "start_seconds": ["160", "2"], "properties": ["background, chirp, birds", "open, close, door"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's 
smokestack", null], "captions_pred_audio": ["birds are chirping and a train is moving ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which door is opening and closing", "label": 1}, {"captions": ["an adult man speaks over glass clinking", "a stream of water flows as people talk and wind blows"], "sample_ids": ["u6jIvCtKarQ", "xBxDz0CFVn0"], "start_seconds": ["70", "30"], "properties": ["a, man, speaks", "stream, water, flow"], "captions_pred_video": ["footage of a person using a blender on a stove top", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and dishes are being moved with background noise ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["speaking following by laughing and clapping", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["u2f5NpsoHBg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["person, laugh, clap", "female, spraying, scream"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a woman is speaking and a baby is crying"], "question": "which entity shows a person speaking", "label": 0}, {"captions": ["a man speaks on a radio as wind blows", "a motorcycle engine works nearby"], "sample_ids": ["tDVADusiIoc", "tOSWIURC-4"], "start_seconds": ["60", "0"], "properties": ["man, radio, blows", "engine, work, nearby"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a lawn mower is running "], "question": "which entity is working", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "a male is speaking and a duck quacks as 
others laugh"], "sample_ids": ["ukxt9I7eMMg", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["continuous, woman, speaking", "male, duck, laugh"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck quacking as others laugh?", "label": 1}, {"captions": ["plastic is tapped on while someone speaks", "wind blowing followed by a zoom"], "sample_ids": ["wvKpEYswXO0", "vr8ZXjEBhMQ"], "start_seconds": ["150", "150"], "properties": ["plastic, tap, speak", "wind, blow, zoom"], "captions_pred_video": ["of the person preparing food in the kitchen", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["a toilet flushes and water drains", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["sfAvvZwdLCY", "xKB8O8LTs6s"], "start_seconds": ["20", "70"], "properties": ["water drains, flushes, water", "music, gunfire, explosion"], "captions_pred_video": ["footage of the toilet in the bathroom", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a toilet is flushed", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is a scene of a toilet flushing and water draining?", "label": 0}, {"captions": ["people converse in the distance as a clock ticks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vZAw4apG0Es", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["people, clock, converse", "applause, audience, yells"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a man is 
talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["an insect buzzes around continuously", "people applaud and hoot and chat quietly"], "sample_ids": ["v25l1jef3JY", "wwyfGO2J4"], "start_seconds": ["0", "90"], "properties": ["buzzes, continuously, insect", "people, applaud, hoot"], "captions_pred_video": ["a black background with a cartoon character in the foreground", null], "captions_pred_audio": ["a fly is buzzing around a microphone ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays"], "sample_ids": ["xKB8O8LTs6s", "sU53zg9Jp7s"], "start_seconds": ["70", "380"], "properties": ["music, radio, gunshots", "a bird chirps, a door bell ringing, a woman gasps"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "a cartoon girl is standing in front of a blue couch"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "birds chirp and a doorbell rings with breathing and music in the background "], "question": "which entity is more calm", "label": 1}, {"captions": ["a dog barks and whimpers", "an infant crying frantically"], "sample_ids": ["sShpyu2l4YQ", "zwOBqeFTgiU"], "start_seconds": ["0", "30"], "properties": ["barks, whimpers, dog", "cry, infant, frantically"], "captions_pred_video": ["the puppies are playing with a toy", "of the baby crying in the car seat"], "captions_pred_audio": ["a dog is barking and growling", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a man speaks as music plays before artillery is fired", 
"some men converse over an engine running"], "sample_ids": ["vcmWSmvti8", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["music, man, fire", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking as music plays before artillery is fired?", "label": 0}, {"captions": ["a woman speaks over sizzling noise", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["yajyRTUQk3U", "wz7N8YRy74I"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "rooster, crow, background, men"], "captions_pred_video": ["- a woman cooking in the kitchen", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "plastic is tapped on while someone speaks"], "sample_ids": ["uYT5gxnyMWM", "wvKpEYswXO0"], "start_seconds": ["50", "150"], "properties": ["person, spray, yell", "plastic, tap, speak"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a person speaking over?", "label": 0}, {"captions": ["a man speaks as music plays before artillery is fired", "a man speaks followed by another man speaking outside"], "sample_ids": ["vcmWSmvti8", "viuTg1M-dqg"], "start_seconds": ["30", "30"], "properties": ["music, man, fire", "two men, speak, follow"], "captions_pred_video": [null, 
"footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking music is playing and an explosion is heard ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more people speaking", "label": 1}, {"captions": ["gunshots ring out, a man yells, and more shots follow", "someone is typing on a computer keyboard"], "sample_ids": ["vKrYfzleLB8", "v0x1odnXtP0"], "start_seconds": ["110", "210"], "properties": ["a, ring, gunshots", "keyboard, type, computer"], "captions_pred_video": ["stock footage of a person holding a gun in their hand", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking with background noise and a cap gun is fired ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["winds blows roughly as a vehicle races past", "several insects fly while two men talk"], "sample_ids": ["xjvTpk2Zpr8", "s-T9OVOiMLo"], "start_seconds": ["70", "330"], "properties": ["wind, blows, vehicle", "several, fly, men"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a man is speaking while insects are buzzing in the background "], "question": "which entity is more likely to be in a garden", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "a man speaks followed by another man speaking outside"], "sample_ids": ["tDlfY3nmx1A", "viuTg1M-dqg"], "start_seconds": ["160", "30"], "properties": ["applause, laugh, man", "two men, speak, follow"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a man is speaking with background 
noise and breathing sounds "], "question": "which entity has two men speaking to each other?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["wy1eKjR7KC0", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["people, talk, distance", "a woman, something, fried"], "captions_pred_video": ["two police officers riding motorcycles down the street", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a woman is speaking while food is frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a harsh wind blows as a man speaks and another man speaks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["y8WEcpOlT3I", "tiDFTC-5vU"], "start_seconds": ["40", "30"], "properties": ["harsh, wind, blows", "male, duck, laugh"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a man speaks as a motor runs in the background"], "sample_ids": ["ukg5L09Wpvo", "xZepNM9qcRA"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "background, motor, run"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "a gun shoots, followed by water sloshing 
nearby"], "sample_ids": ["wyllXV6PjKo", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["a kid, talk, cry", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a woman speaks and a baby cries", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a clock ticktocks in wind", "wind blows as people chatter quietly"], "sample_ids": ["yVumC9TGknc", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticktocks, clock, wind", "wind, chatter, people"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", "footage is blurry and out of focus"], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a motorcycle engine works nearby", "water pouring and bubbling"], "sample_ids": ["tOSWIURC-4", "uyRfq-jKPpo"], "start_seconds": ["0", "50"], "properties": ["engine, work, nearby", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a lawn mower is running ", 
"water is running from a faucet"], "question": "which is a liquid", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["sTpirNYo8vQ", "w34HjHr6gAY"], "start_seconds": ["30", "30"], "properties": ["a, tone, fast", "beeps, hit, woman"], "captions_pred_video": ["of a man taking a selfie on a bus", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a beep sounds followed by a child speaking"], "question": "which entity has a woman talking?", "label": 1}, {"captions": ["multiple birds chirp and an animal grunts", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tDlysoZiA1I", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["animal, grunt, multiple", "female, spraying, scream"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["young female child snoring and breathing deeply", "a person sniffs and sneezes"], "sample_ids": ["sAam2NqGhLY", "uRlbY6aoBU"], "start_seconds": ["20", "0"], "properties": ["snoring, breathing, child", "sneezes, person, sniffs"], "captions_pred_video": ["of a little girl sleeping on a couch", null], 
"captions_pred_audio": ["a person is snoring", "a man is sneezing "], "question": "which entity is a person", "label": 1}, {"captions": ["a person snoring", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["t8tv5YRMJUg", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["a person, snore, loud", "a, scream, girl"], "captions_pred_video": ["of a man getting his face licked by another man", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person sniffs and breathes heavily", "a woman is speaking and a baby is crying"], "question": "which entity is louder", "label": 1}, {"captions": ["a clicking followed by some people laughing and a kid speaking", "an infant crying as a woman laughs"], "sample_ids": ["vz8868znkVQ", "xhmRY9yhC7c"], "start_seconds": ["60", "20"], "properties": ["audio, click, kid speaking", "a, laugh, infant"], "captions_pred_video": ["a video of a plane flying over a cloudy sky", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a baby is laughing and breathing with background noise ", "a baby cries and a woman speaks"], "question": "which entity is a video", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zgUgkpk78xU", "yajyRTUQk3U"], "start_seconds": ["70", "400"], "properties": ["clinking, humming, horn", "a woman, something, fried"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking while food is 
frying in the background"], "question": "which entity is about cooking?", "label": 1}, {"captions": ["a large crowd cheers and applauds", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["rqfQRErjfk8", "w5W5Kqtc8E"], "start_seconds": ["170", "100"], "properties": ["crowd, cheers, applauds", "wind, blow, vehicle"], "captions_pred_video": ["a man hugging another man in front of an orchestra", null], "captions_pred_audio": ["a crowd of people clapping and cheering", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more likely to be at a sporting event", "label": 0}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "dishes cling together then a man begins to speak"], "sample_ids": ["vbZ-0lGPneg", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["a woman, a television program, a bird", "cling, speak, dishes"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "mechanisms are operating and water is splashing "], "question": "which entity has a bird?", "label": 0}, {"captions": ["a church bell rings several times", "a car speeding up in the distance"], "sample_ids": ["sUVVjE3Ucp8", "u0TrcHhkPQ"], "start_seconds": ["0", "20"], "properties": ["ring, bell, several", "distance, car, speed"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", null], "captions_pred_audio": ["a church bell is ringing ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a jet engine spools up and takes off"], "sample_ids": ["tIY7qOV3rEM", "vBslzh7saPw"], "start_seconds": ["0", "90"], "properties": ["animal, bark, dog, barking, small, goat, 
bleating", "engine, spools, takes"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a pickup truck carrying a large object down the road"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a jet engine roars and accelerates "], "question": "which entity is not a living thing", "label": 1}, {"captions": ["a telephone rings followed by a woman talking", "a man speaks as a car is passing by"], "sample_ids": ["tGcFnX0GHI", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["ring, talk, woman", "a, car, pass"], "captions_pred_video": [null, "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a dial tone sounds followed by a woman speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["a vehicle engine revs as the vehicle passes", "people cheer as a vehicle engine revs"], "sample_ids": ["yDoT73BWsdA", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["engine, revs, vehicle", "engine revs, vehicle, people"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a truck is revving its engine and a man is speaking "], "question": "which vehicle is revving its engine", "label": 1}, {"captions": ["an infant crying as a woman laughs", "dishes cling together then a man begins to speak"], "sample_ids": ["xhmRY9yhC7c", "sQGXqGcwOTc"], 
"start_seconds": ["20", "3"], "properties": ["a, laugh, infant", "cling, speak, dishes"], "captions_pred_video": ["of a baby crying in a baby bouncer", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a baby cries and a woman speaks", "mechanisms are operating and water is splashing "], "question": "which entity is about a woman and an infant?", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "people speak as gunfire rings out"], "sample_ids": ["vzxHnu-SFEw", "wqTCwqVRDlk"], "start_seconds": ["80", "80"], "properties": ["two objects, woman, speak", "gunfire, ring, speak"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["an engine runs and a man speaks", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["yT5WfYMRr-U", "wqZ135Ssz0"], "start_seconds": ["30", "60"], 
"properties": ["engine, run, man", "two men, woman, birds"], "captions_pred_video": ["a man in a red jacket and sunglasses driving a boat", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a infant makes noise and is excited"], "sample_ids": ["zPX9o1uDiI", "wIJK3-5y0kA"], "start_seconds": ["40", "30"], "properties": ["engine, horn, run", "noise, excited, infant"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["multiple ducks quack continuously", "several insects fly while two men talk"], "sample_ids": ["wfHeoPDLMaM", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["multiple, quack, continuously", "several, fly, men"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "a man climbing up a tree using a rope to 
reach the top of the tree"], "captions_pred_audio": ["ducks are quacking", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a video of a single event", "label": 1}, {"captions": ["a person speaks briefly", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["zOZleIRqZm4", "wDVMhEdTiVw"], "start_seconds": ["80", "30"], "properties": ["person, talk, brief", "gun, shoot, water"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause injury", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "someone is typing on a computer keyboard"], "sample_ids": ["wvKpEYswXO0", "v0x1odnXtP0"], "start_seconds": ["150", "210"], "properties": ["water, tap, run", "keyboard, type, computer"], "captions_pred_video": ["of the person preparing food in the kitchen", "how to make money on youtube in spanish"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a person is typing on a keyboard"], "question": "which action is performed on a computer", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["y1saVTXsKwc", "xBxDz0CFVn0"], "start_seconds": ["80", "30"], "properties": ["a, dog, talk", "stream, water, flow"], "captions_pred_video": ["a dog playing with a pink ball", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog barks and a man speaks", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "a man speaks over a radio 
as wind blows and water splashes"], "sample_ids": ["tIY7qOV3rEM", "tDVADusiIoc"], "start_seconds": ["0", "60"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "water, radio, man"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a power tool runs and touches a surface", "a man speaks as a motor runs in the background"], "sample_ids": ["zfvPRf3chY", "xZepNM9qcRA"], "start_seconds": ["290", "30"], "properties": ["power tool, run, touch", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is not touching a surface", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "roadway noise occurs and a truck accelerates"], "sample_ids": ["wTjoRj1se3U", "tgbONvsP47Y"], "start_seconds": ["390", "0"], "properties": ["engine, run, people", "noise, truck, accelerate"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car is driving on the road "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "paper is crumpling consistently"], "sample_ids": ["w2M4i1mklOA", "v5cSxLaHADY"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of an antique clock", "footage of the 
person holding a pair of scissors"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a clock ticks quietly and rhythmically"], "sample_ids": ["y8WEcpOlT3I", "u7C-AEBQM"], "start_seconds": ["40", "30"], "properties": ["wind, speak, buffeting", "ticks, rhythmic, quiet"], "captions_pred_video": ["on how to use a sewing machine youtube", null], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a ticktock of a clock"], "question": "which entity is quieter", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "music plays and a woman speaks on a radio before gunshots are fired"], "sample_ids": ["vddP56-ogds", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["liquid, laughs, man", "music, radio, gunshots"], "captions_pred_video": [null, "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vs65y4qmyBE", "vbZ-0lGPneg"], "start_seconds": ["340", "30"], "properties": ["engine, run, man", "a woman, a television program, a bird"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["wind blows strongly and a young man speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": 
["vs65y4qmyBE", "xBxDz0CFVn0"], "start_seconds": ["340", "30"], "properties": ["wind, blows, strongly", "stream, water, flow"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["multiple motorcycles pass by as a man speaks", "leaves rustling followed by a small bell chiming as birds chirp in the background"], "sample_ids": ["zcDwZ6W7E3E", "ziUT9IFTkjg"], "start_seconds": ["180", "10"], "properties": ["man, speak, motorcycles", "background, birds, rustling"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", null], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "birds are chirping and a chime is ringing "], "question": "which entity is a video of a man speaking?", "label": 0}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "someone is typing on a computer keyboard"], "sample_ids": ["sG7TyPnFDR0", "v0x1odnXtP0"], "start_seconds": ["180", "210"], "properties": ["beeps, machine, smoke alarm", "keyboard, type, computer"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a person is typing on a keyboard"], "question": "which entity is typing on a computer keyboard?", "label": 1}, {"captions": ["a door slams shut and an object moves on a hard surface", "wind blows as people chatter quietly"], "sample_ids": ["zkKdxzNC97Y", "xBxDz0CFVn0"], "start_seconds": ["27", "30"], "properties": ["hard, surface, door", "wind, chatter, people"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "footage 
is blurry and out of focus"], "captions_pred_audio": ["a door is opened and closed", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a car accelerates and wind blows", "a man speaks followed by another man speaking outside"], "sample_ids": ["u0TrcHhkPQ", "viuTg1M-dqg"], "start_seconds": ["20", "30"], "properties": ["accelerates, wind, blows", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["an audience gives applause", "paper is crumpling consistently"], "sample_ids": ["x6iCUDmRpKQ", "v5cSxLaHADY"], "start_seconds": ["38", "0"], "properties": ["applause, audience, give", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a black background with the moon and stars in the sky", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a group of people are clapping and cheering", "paper is crumpled and crinkled"], "question": "which is not a person", "label": 0}, {"captions": ["a woman speaks and then a man speaks", "a train horn blows as it passes by"], "sample_ids": ["vbpKkWvfOu4", "zVacuqSb4LI"], "start_seconds": ["560", "30"], "properties": ["a, man, speaks", "horn, blows, train"], "captions_pred_video": ["2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike 
amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a woman is speaking and a man is speaking", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which is not a person speaking", "label": 1}, {"captions": ["birds chirp as a train approaches", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xM4joTqDVp4", "yDoT73BWsdA"], "start_seconds": ["160", "10"], "properties": ["bird, chirp, train", "engine, revs, vehicle"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["birds chirp quietly and an adult man speaks", "an infant crying frantically"], "sample_ids": ["zuua6-5goWw", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["birds, chirp, quiet, man, speaks", "cry, infant, frantically"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 
10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "of the baby crying in the car seat"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a baby cries loudly"], "question": "which entity is quieter", "label": 0}, {"captions": ["a baby cries and wails as an adult female speaks", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["zliInBdC98Y", "yajyRTUQk3U"], "start_seconds": ["30", "400"], "properties": ["a, baby, cries, wails", "a woman, something, fried"], "captions_pred_video": ["of a cute little girl playing with her hair in the bathroom", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a baby cries and a woman speaks", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["an emergency siren wails as it passes", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vGj1XLJvNrw", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["wails, wails, pass", "female, spraying, scream"], "captions_pred_video": ["footage of a police car driving down a city street", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a sleeping person snores and wheezes", "a infant makes noise and is excited"], "sample_ids": ["spJCm8tD9Zo", "wIJK3-5y0kA"], "start_seconds": ["90", "30"], "properties": ["snores, wheezes, sleeps", "noise, excited, infant"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a person is 
snoring loudly", "a baby cries and a woman speaks"], "question": "which entity is more quiet", "label": 0}, {"captions": ["animals bleat and moo as a person speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["tPJvjq9QePY", "tdWhHV3X25Q"], "start_seconds": ["40", "60"], "properties": ["animal, bleat, moo", "applause, audience, yells"], "captions_pred_video": ["a dog and a sheep in a barn", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a baby cries and a man speaks", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["someone sprays liquid onto a hard surface", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sQwlkXjQabo", "y8WEcpOlT3I"], "start_seconds": ["10", "40"], "properties": ["liquid, surface, spray", "harsh, wind, blows"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "on how to use a sewing machine youtube"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with wind noise in the background "], "question": "which entity is not a liquid", "label": 1}, {"captions": ["children cry and people talk", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["xLwHe825Zs", "vfYTJq7nU"], "start_seconds": ["18", "130"], "properties": ["people talk, children cry, people talk", "rustling, ducks, quack"], "captions_pred_video": [null, null], "captions_pred_audio": ["a baby cries and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about animals?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "a car speeding up in the distance"], "sample_ids": ["tiDFTC-5vU", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["male, duck, laugh", "distance, car, speed"], 
"captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a race car accelerates and revs its engine "], "question": "which is moving faster", "label": 0}, {"captions": ["a vehicle engine revs as the vehicle passes", "paper is crumpling consistently"], "sample_ids": ["yDoT73BWsdA", "v5cSxLaHADY"], "start_seconds": ["10", "0"], "properties": ["engine, revs, vehicle", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["water quietly rushes by while birds chirp in the background", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["sYITalLZjj4", "uZesmtKZGSw"], "start_seconds": ["30", "250"], "properties": ["water, rushes, background, birds", "men, talk, cars"], "captions_pred_video": ["two ducks are swimming in the water near each other", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["wind blows and birds chirp", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["an engine idles quietly then gradually becomes louder", "a loud snarling engine is followed by a man 
laughing"], "sample_ids": ["vbr9mHKc8WM", "zl9Dqx-j7q4"], "start_seconds": ["40", "6"], "properties": ["noise, loudness, engine", "engine, laugh, loud"], "captions_pred_video": [null, "footage of a man driving a car in the dark"], "captions_pred_audio": ["an engine is idling", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as crickets sing", "a car accelerates and wind blows"], "sample_ids": ["ryFDPxgDOGc", "u0TrcHhkPQ"], "start_seconds": ["570", "20"], "properties": ["a, crickets, sing", "accelerates, wind, blows"], "captions_pred_video": ["a group of people dressed in camouflage and hunting gear in the dark", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine beeps continuously", "rustling with distant murmuring"], "sample_ids": ["y682ml90jGw", "wnNNcxAPwGQ"], "start_seconds": ["11", "0"], "properties": ["beeps, machine, continuously", "sound, distance, rustling"], "captions_pred_video": [null, "footage of a yellow truck doing a burnout on a race track"], "captions_pred_audio": ["a beeping sound is being made ", "a crowd of people are talking and laughing while a skateboard rolls by "], "question": "which entity is quieter", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ugHJF0hfYkg", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["engine, running, continuously", "wind, blow, vehicle"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", null], "captions_pred_audio": ["a helicopter is flying overhead ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a duck quacks continuously", "a stream of water runs briefly"], 
"sample_ids": ["vh30P49Po6s", "x-PeY8Yb8M4"], "start_seconds": ["30", "300"], "properties": ["quacks, continuously, duck", "stream, water, run"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a duck is quacking loudly", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a heavy rain falls endlessly"], "sample_ids": ["zofjfKhqLk8", "wP8ZKrlx3oA"], "start_seconds": ["10", "40"], "properties": ["noise, stop, motor", "heavy, rain, fall"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "winds blows roughly as a vehicle races past"], "sample_ids": ["sTpirNYo8vQ", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["a, tone, fast", "wind, blows, vehicle"], "captions_pred_video": ["of a man taking a selfie on a bus", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a jet engine roars and wind blows "], "question": "which entity is not a vehicle?", "label": 0}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["uRExseg-0XI", "zl9Dqx-j7q4"], "start_seconds": ["210", "6"], "properties": ["woman, man, water", "engine, laugh, loud"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a man driving a car in the dark"], 
"captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["people clap and speak in the distance", "people speak as gunfire rings out"], "sample_ids": ["wwyfGO2J4", "wqTCwqVRDlk"], "start_seconds": ["90", "80"], "properties": ["clap, distance, speak", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vddP56-ogds", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "a man speaks as a car is passing by"], "sample_ids": ["vJ7JPEFhyLA", "sK4u5T8hW78"], "start_seconds": ["16", "30"], "properties": ["three men, wind, flow", "a, car, pass"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], 
"captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "food is frying then a woman speaks"], "sample_ids": ["w34HjHr6gAY", "ukxt9I7eMMg"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "food, woman, speak"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman is speaking while food is frying in the background "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["people speak softly as food sizzles", "a woman speaks happily and an animal chirps"], "sample_ids": ["yhQ2Lg-7qDY", "uWAAAL4CIoc"], "start_seconds": ["130", "0"], "properties": ["food, sizzle, speak", "a woman, chirps, animal"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", null], "captions_pred_audio": ["a faucet is running and a man is speaking", "a woman is speaking and a dog is barking "], "question": "which entity has a more active animal", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "plastic is tapped on while someone speaks"], "sample_ids": ["vf9xf3vMsGM", "wvKpEYswXO0"], "start_seconds": ["540", "150"], 
"properties": ["A man speaks while turning a water faucet on.", "plastic, tap, speak"], "captions_pred_video": ["of the person washing their hands under the faucet", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a man speaks in the background while a slow tick repeats", "birds chirp and objects are moved around"], "sample_ids": ["vZAw4apG0Es", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["background, tick, repeat", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a clock is ticking and people are talking", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "water is sprayed across a hard surface"], "sample_ids": ["uPDn2BFTHk", "sQwlkXjQabo"], "start_seconds": ["140", "10"], "properties": ["woman, laughs, speaks", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a baby laughs and a woman speaks", "spraying followed by silence"], "question": "which entity is a video of a person speaking?", "label": 0}, {"captions": ["a man speaks as a motor runs in the background", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["xZepNM9qcRA", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["background, motor, run", "gun, shoot, water"], "captions_pred_video": ["a close-up view of the motorcycle's engine and exhaust system", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man speaks while a 
motorcycle revs and accelerates ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "bees buzz and wind blows"], "sample_ids": ["yJ0TePmaOo", "tMJne1a4AFI"], "start_seconds": ["390", "0"], "properties": ["two hard objects, man, speak", "bees buzz, wind blows, bees"], "captions_pred_video": [null, "a swarm of bees on the ground"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "a swarm of bees buzzing around"], "question": "which entity is moving", "label": 1}, {"captions": ["food is frying while a woman speaks", "people speak in the background as a clock ticktocks"], "sample_ids": ["yhQ2Lg-7qDY", "vZAw4apG0Es"], "start_seconds": ["130", "30"], "properties": ["food, woman, speak", "background, clock, ticktocks"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "a clock made out of wood and gears with birds on top of it"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a clock is ticking and people are talking"], "question": "which entity has a clock ticktocking in the background?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a chime of a clock followed by various tones of ticking with come clinking"], "sample_ids": ["wz7N8YRy74I", "uqFtmnhuqA8"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "a, b, c"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "shows a clock on the wall of a room with a person standing in front of it"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "mechanisms are ticking and a hammer is striking "], "question": "which entity is a clock?", "label": 1}, {"captions": ["insects humming with a dog barking and small goat 
bleating", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["tIY7qOV3rEM", "xKB8O8LTs6s"], "start_seconds": ["0", "70"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "music, gunfire, explosion"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["an infant crying as a woman laughs", "a stream of water runs briefly"], "sample_ids": ["xhmRY9yhC7c", "x-PeY8Yb8M4"], "start_seconds": ["20", "300"], "properties": ["a, laugh, infant", "stream, water, run"], "captions_pred_video": ["of a baby crying in a baby bouncer", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby cries and a woman speaks", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["someone whistles briefly", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["uFoga8sHpiw", "wz7N8YRy74I"], "start_seconds": ["90", "30"], "properties": ["sound, duration, pitch", "rooster, crow, background, men"], "captions_pred_video": ["footage of a bird in a cage", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a person whistles a song", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a longer duration", "label": 1}, {"captions": ["propeller rearing loudly with some male and female voices interspersed in the background", "a duck quacks continuously"], "sample_ids": ["vqZuVbG6-HI", "vh30P49Po6s"], "start_seconds": ["130", "30"], "properties": ["background, male, female", "quacks, continuously, duck"], "captions_pred_video": ["footage is blurry because 
it's raining outside", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "a duck is quacking loudly"], "question": "which entity is a duck?", "label": 1}, {"captions": ["a person is snoring while sleeping", "plastic is tapped on while someone speaks"], "sample_ids": ["vJrjSeP17yE", "wvKpEYswXO0"], "start_seconds": ["40", "150"], "properties": ["a person is sleeping, snoring, person", "plastic, tap, speak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a person snoring loudly", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["a clock ticktocks", "a car speeding up in the distance"], "sample_ids": ["v-g-j2uTByM", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["ticktocks, clock, ticktocks", "distance, car, speed"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", null], "captions_pred_audio": ["a clock is ticking loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a jet engine screams, then increases its power", "winds blows roughly as a vehicle races past"], "sample_ids": ["vBslzh7saPw", "xjvTpk2Zpr8"], "start_seconds": ["90", "70"], "properties": ["power, scream, increase", "wind, blows, vehicle"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a jet engine roars and accelerates ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["a storm rolls by as thunder and lighting strike in the distance", "a woman speaks happily and an animal chirps"], "sample_ids": ["yNtRmrn0io8", 
"uWAAAL4CIoc"], "start_seconds": ["210", "0"], "properties": ["storm, distance, strike", "a woman, chirps, animal"], "captions_pred_video": ["footage of a house in the middle of the night", null], "captions_pred_audio": ["rain falls and thunder roars", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["a toilet flushes and water drains unevenly", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vhJWZheqaE", "tiDFTC-5vU"], "start_seconds": ["0", "30"], "properties": ["water drains unevenly, toilet flushes, water drains", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["a toilet is flushed", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking to a duck?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "an infant crying as a woman laughs"], "sample_ids": ["zcDwZ6W7E3E", "xhmRY9yhC7c"], "start_seconds": ["180", "20"], "properties": ["a, man, speak", "a, laugh, infant"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a gun shoots, followed by water sloshing nearby", "wind blows as people chatter quietly"], "sample_ids": ["wDVMhEdTiVw", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["gun, shoot, water", "wind, chatter, people"], "captions_pred_video": ["a blurry image of trees and water in the forest", "footage is blurry and out of focus"], "captions_pred_audio": ["a gun is fired followed by splashing and a person sneezing", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["men speak and a 
nozzle sprays liquid", "a duck quacks continuously"], "sample_ids": ["wRV8yMk886E", "vh30P49Po6s"], "start_seconds": ["0", "30"], "properties": ["liquid, spray, nozzle", "quacks, continuously, duck"], "captions_pred_video": ["two cars are parked in a parking lot at night", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["continuous sizzling with a woman speaking towards the end", "people cheer as a vehicle engine revs"], "sample_ids": ["ukxt9I7eMMg", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["continuous, woman, speaking", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a stream runs then someone speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["wbHTKEJZyhc", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["stream, run, someone", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background 
video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a child and woman laughs and the woman speaks", "a child speaks in closed space"], "sample_ids": ["uPDn2BFTHk", "yW6FWLSLkx4"], "start_seconds": ["140", "40"], "properties": ["woman, laughs, speaks", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["a duck quacks continuously", "wind blowing followed by a zoom"], "sample_ids": ["vh30P49Po6s", "vr8ZXjEBhMQ"], "start_seconds": ["30", "150"], "properties": ["quacks, continuously, duck", "wind, blow, zoom"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a duck is quacking loudly", "wind blows and a chainsaw cuts through wood "], "question": "which entity is a zoom", "label": 1}, {"captions": ["birds chirp as a train approaches", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["xM4joTqDVp4", "uZesmtKZGSw"], 
"start_seconds": ["160", "250"], "properties": ["bird, chirp, train", "men, talk, cars"], "captions_pred_video": ["footage is of a train station with a train on the tracks and smoke coming out of the train's smokestack", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["birds are chirping and a train is moving ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a drill runs and two people laugh", "a vehicle engine accelerating then running on idle"], "sample_ids": ["tEE3MpBt1sg", "vYkA3cfXp5Q"], "start_seconds": ["50", "30"], "properties": ["two people, laugh, drill", "engine, accelerate, idle"], "captions_pred_video": ["footage is blurry due to the smoke in the air", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["people are laughing breathing and speaking with background noise ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["loud, continuous burping", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["y636gklDioE", "y8WEcpOlT3I"], "start_seconds": ["20", "40"], "properties": ["loud, continuous, burping", "harsh, wind, blows"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a person burps loudly several times", "a man is speaking with wind 
noise in the background "], "question": "which entity is not continuous", "label": 1}, {"captions": ["birds chirp as a bell rings", "a child speaks in closed space"], "sample_ids": ["ziUT9IFTkjg", "yW6FWLSLkx4"], "start_seconds": ["10", "40"], "properties": ["chirp, bell, ring", "child, space, speak"], "captions_pred_video": [null, "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is in a closed space", "label": 1}, {"captions": ["some people speak", "a propeller rotates loudly and intensely"], "sample_ids": ["vbZ-0lGPneg", "ugHJF0hfYkg"], "start_seconds": ["30", "10"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "loud, intense, propeller"], "captions_pred_video": ["of a man holding a baby duck in his hands", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["a clock ticktocks continuously", "water splashes and a motorboat passes as people yell"], "sample_ids": ["vlJS7LN2XyM", "w5W5Kqtc8E"], "start_seconds": ["30", "100"], "properties": ["ticktocks, clock, ticktocks continuously", "water, splashes, motorboat"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the clock are hard to see due to the blurriness of the video", null], "captions_pred_audio": ["a ticktock of a clock", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is more active", "label": 1}, {"captions": ["some light rustling followed by a loud burp and a girl speaking", "people cheer as a vehicle engine revs"], "sample_ids": ["vdoxuJn9lTc", "xjhAnI2q6hM"], "start_seconds": ["40", "6"], 
"properties": ["burp, loud, girl", "engine revs, vehicle, people"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a child speaks followed by a burp", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["yZp6xizR0yU", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["animal, bleat, cry", "engine, laugh, loud"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "a vehicle engine runs while a siren and horn sound"], "sample_ids": ["slZLHwNbbt4", "u--KhUW8l1Y"], "start_seconds": ["300", "0"], "properties": ["a, horn, run", "engine, sound, horn"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "a firefighter spraying water from a fire hydrant at night"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a fire truck siren blares and a horn blows "], "question": "which entity has a horn that rings out as a machine runs by?", "label": 0}, {"captions": ["a baby cries and a woman speaks", "a car accelerates and wind blows"], "sample_ids": ["tMbMDvT50j8", "u0TrcHhkPQ"], "start_seconds": ["12", "20"], "properties": ["a, cry, woman", "accelerates, wind, blows"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a race car accelerates and revs its engine 
"], "question": "which entity is moving", "label": 1}, {"captions": ["a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sU53zg9Jp7s", "zFjIWfSD-4"], "start_seconds": ["380", "410"], "properties": ["a bird chirps, a door bell ringing, a woman gasps", "People, motor, brakes"], "captions_pred_video": ["a cartoon girl is standing in front of a blue couch", null], "captions_pred_audio": ["birds chirp and a doorbell rings with breathing and music in the background ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["paper is repeatedly crumpled and crinkled", "birds chirp and objects are moved around"], "sample_ids": ["vms5XGTDVQc", "yPUYU6t3rwo"], "start_seconds": ["220", "370"], "properties": ["paper, crumpled, crinkled", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["footage of a woman opening a black bag on a table", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["paper is crumpled and crinkled", "insects buzz and a man speaks"], "question": "which entity is more likely to be a video of a person's hands?", "label": 0}, {"captions": ["children speak as a female ask them questions", "a person sniffs and sneezes"], "sample_ids": ["wEBlkGWVWwE", "uRlbY6aoBU"], "start_seconds": ["260", "0"], "properties": ["female, speak, questions", "sneezes, person, sniffs"], "captions_pred_video": ["shows a person writing on the whiteboard", null], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is sneezing "], "question": "which entity is a person", "label": 1}, {"captions": ["a man is filing a hard object", "a telephone rings followed by a woman talking"], "sample_ids": ["vveS8HT7Uog", "tGcFnX0GHI"], "start_seconds": ["100", 
"0"], "properties": ["a man, hard, object", "ring, talk, woman"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", null], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a dial tone sounds followed by a woman speaking"], "question": "which entity is talking", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "paper folding and crinkling"], "sample_ids": ["vZAw4apG0Es", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["background, tick, repeat", "paper, fold, crinkle"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a clock is ticking and people are talking", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling paper?", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a toilet flushes and a female speaks"], "sample_ids": ["u--KhUW8l1Y", "yaln9y8I7ms"], "start_seconds": ["0", "230"], "properties": ["horn, siren, life", "female, flushes, toilet"], 
"captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage is blurry and out of focus"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a toilet flushes and a man speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["a infant makes noise and is excited", "paper folding and crinkling"], "sample_ids": ["wIJK3-5y0kA", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["noise, excited, infant", "paper, fold, crinkle"], "captions_pred_video": ["of a baby playing with a cat in a dark room", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a baby cries and a woman speaks", "the wind blows and a mouse clicks "], "question": "which entity is not a person", "label": 1}, {"captions": ["children cry and people talk", "pigeons vocalize and birds chirp"], "sample_ids": ["xLwHe825Zs", "uiS58TNyUiw"], "start_seconds": ["18", "430"], "properties": ["people talk, children cry, people talk", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and a bee is buzzing"], "question": "which 
entity is a bird", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a train horn blows as it passes by"], "sample_ids": ["uRExseg-0XI", "zVacuqSb4LI"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "horn, blows, train"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a train?", "label": 1}, {"captions": ["birds fly and flutter around", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["wGKgwOP3h30", "xKB8O8LTs6s"], "start_seconds": ["30", "70"], "properties": ["fly, flutter, around", "music, gunfire, explosion"], "captions_pred_video": ["of the pigeons in the coop", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["pigeons coo and flap their wings", "music plays while a woman speaks and gunshots are fired "], "question": "which entity is more active", "label": 1}, {"captions": ["a young woman speaks over 
spraying and another person yells", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["uYT5gxnyMWM", "zFjIWfSD-4"], "start_seconds": ["50", "410"], "properties": ["person, spray, yell", "People, motor, brakes"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person speaking over spraying and another person yelling?", "label": 0}, {"captions": ["a vehicle engine runs while a siren and horn sound", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["u--KhUW8l1Y", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["engine, sound, horn", "airplane, boy, fly"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a propeller rotates loudly and intensely"], "sample_ids": ["wSVhSdj0F0", "ugHJF0hfYkg"], "start_seconds": ["10", "10"], "properties": ["horn honks, keys jingle, electronic beep", "loud, intense, propeller"], "captions_pred_video": [null, "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["wind loudly blowing while people speak in the background followed by a horn blowing", "people speak as gunfire rings out"], "sample_ids": ["xjhAnI2q6hM", "wqTCwqVRDlk"], "start_seconds": ["6", "80"], "properties": ["wind, 
blow, loudly", "gunfire, ring, speak"], "captions_pred_video": ["a school bus decorated with christmas lights is floating in the water", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a truck is revving its engine and a man is speaking ", "a man is speaking and a gun is fired"], "question": "which entity is more violent", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "a toilet flushes and a female speaks"], "sample_ids": ["sOa7g-44Dag", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["background, man, spray", "female, flushes, toilet"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "the revving of an engine throttle followed by a man speaking"], "sample_ids": ["wTjoRj1se3U", "tezvROoo4bs"], "start_seconds": ["390", "40"], "properties": ["engine, run, people", "audio, throttle, speaking"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of a busy city street with cars parked on both sides of the road"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car accelerates and revs while a man speaks "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "white noise and snoring with some rustling in the background"], "sample_ids": ["xKB8O8LTs6s", "xzKKf9bKNUo"], "start_seconds": ["70", "10"], "properties": ["music, radio, gunshots", "background, noise, snoring"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "shows a woman laying on a 
bed with her eyes closed and her mouth open"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a person snoring loudly"], "question": "which entity has a woman speaking on a radio?", "label": 0}, {"captions": ["low humming with a clock ticking and birds chirping", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["yVumC9TGknc", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["humming, clock, birds", "male, duck, laugh"], "captions_pred_video": ["game title screen of the game shadow of the colossus on sony playstation 2", null], "captions_pred_audio": ["a series of beeps and chirps", "a man is speaking and ducks are quacking"], "question": "which entity has a clock ticking?", "label": 0}, {"captions": ["an adult woman speaks over chopping and silverware noises", "a child speaks in closed space"], "sample_ids": ["yYJksgsxx5U", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["audio, woman, silverware", "child, space, speak"], "captions_pred_video": ["of a woman slicing an orange on a cutting board", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a woman is speaking and dishes are clanging in the background ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child speaking?", "label": 1}, {"captions": ["children speak as a female ask them questions", "water flows as men speak and yell"], "sample_ids": ["wEBlkGWVWwE", "vJ7JPEFhyLA"], "start_seconds": ["260", "16"], "properties": ["female, speak, questions", "water, flow, men"], "captions_pred_video": ["shows a person writing on the whiteboard", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more calm", "label": 0}, 
{"captions": ["two frogs croak at each other", "a man speaks as a car is passing by"], "sample_ids": ["zg0X6BnhOLQ", "sK4u5T8hW78"], "start_seconds": ["410", "30"], "properties": ["two frogs, croak, at each other", "a, car, pass"], "captions_pred_video": ["footage of lightning in the sky at night", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a frog is croaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["wind blows as people chatter quietly", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["xBxDz0CFVn0", "zl9Dqx-j7q4"], "start_seconds": ["30", "6"], "properties": ["wind, chatter, people", "engine, laugh, loud"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows as people chatter quietly", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["xBxDz0CFVn0", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["wind, chatter, people", "applause, audience, yells"], "captions_pred_video": ["footage is blurry and out of focus", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a 
horn honks and then loudly blares", "multiple people speak and children yell while water gurgles"], "sample_ids": ["wnpJndXuxLc", "vb1fPSDI4c"], "start_seconds": ["50", "30"], "properties": ["horn, honk, loud", "multiple, people, yell"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a crowd of people are talking and laughing"], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks then rubs two hard objects together", "water splashes as an animal walks through"], "sample_ids": ["yJ0TePmaOo", "w1ir-sZ3Im8"], "start_seconds": ["390", "90"], "properties": ["two hard objects, man, speak", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking with background noise and filing sounds ", "water splashes and gurgles as people speak"], "question": "which entity is more likely to cause water to splash", "label": 1}, {"captions": ["a train horn blows as it passes by", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zVacuqSb4LI", "su6FAOcOA8c"], "start_seconds": ["30", "4"], "properties": ["horn, blows, train", "engine, idle, woman"], "captions_pred_video": ["by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong 
a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a train whistle blows and a train passes by with a whistle blowing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["rain falls on a surface as men speak and thunder roars", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["w0xsN8X18Y", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["rain, thunder, surface", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an aircraft engine runs as wind blows heavily", "a woman and man speak while food is frying"], "sample_ids": ["xjvTpk2Zpr8", "zk-xJGQU8-4"], "start_seconds": ["70", "130"], "properties": ["engine, run, wind", "food, man, woman"], "captions_pred_video": ["footage of a dhl plane landing on the runway", "a man and a woman cooking in a wok on the stove"], "captions_pred_audio": ["a jet engine roars and wind blows ", "a woman is speaking while dishes are clanging and music is playing in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "a clock ticktocks"], "sample_ids": ["tDlysoZiA1I", "v-g-j2uTByM"], "start_seconds": ["0", "30"], "properties": ["animal, grunts, chirps", "ticktocks, clock, ticktocks"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a rooster is crowing 
", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["bees buzz and wind blows", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["tMJne1a4AFI", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["bees buzz, wind blows, bees", "female, spraying, scream"], "captions_pred_video": ["a swarm of bees on the ground", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "a man woman speak while crickets sing"], "sample_ids": ["s4Uz1Ffgo04", "zTLVJCo4WEE"], "start_seconds": ["100", "30"], "properties": ["roars, background, people speaking", "a, crickets, sing"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "- a boy with a rifle aiming at a target"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a woman speaks and crickets chirp"], "question": "which entity is quieter", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "continuous sizzling with a woman speaking towards the end"], "sample_ids": ["zofjfKhqLk8", "ukxt9I7eMMg"], "start_seconds": ["10", "30"], "properties": ["background, metal, clank", "continuous, woman, speaking"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking while food is frying in the background "], "question": "which entity is a video of a woman speaking?", "label": 1}, {"captions": ["a church bell rings several times", "some tunes played by whistling"], "sample_ids": 
["sUVVjE3Ucp8", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["ring, bell, several", "tune, play, whistling"], "captions_pred_video": ["the video shows a stone wall with a clock on top of it and a bench in front of it", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a church bell is ringing ", "a person whistling a song"], "question": "which is a musical instrument", "label": 1}, {"captions": ["water flows as a woman laughs and a man speaks", "people speak as gunfire rings out"], "sample_ids": ["vddP56-ogds", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["water, flow, laugh", "gunfire, ring, speak"], "captions_pred_video": [null, "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and a gun is fired"], "question": "which entity is more active", "label": 1}, {"captions": ["a person is burping while a girl speaks", "plastic is tapped on while someone speaks"], "sample_ids": ["vdoxuJn9lTc", "wvKpEYswXO0"], "start_seconds": ["40", "150"], "properties": ["person, burp, girl", "plastic, tap, speak"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", "of the person preparing food in the kitchen"], "captions_pred_audio": ["a child speaks followed by a burp", "a woman is speaking and tapping with background noise and water running "], "question": "which entity is tapped on while someone speaks", "label": 1}, {"captions": ["birds chirp and pigeons vocalize while walking around", "a man speaks followed by another man speaking outside"], "sample_ids": ["wIvYjuR3nrg", "viuTg1M-dqg"], "start_seconds": ["9", "30"], "properties": ["birds, pigeons, vocalize", "two men, speak, follow"], "captions_pred_video": ["footage of a pigeon sitting on a roof with trees in the background", "footage of water coming out of a hole in the ground"], "captions_pred_audio": 
["birds are chirping and cooing", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a group of animals", "label": 0}, {"captions": ["a man speaks in the background while a slow tick repeats", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["vZAw4apG0Es", "tdWhHV3X25Q"], "start_seconds": ["30", "60"], "properties": ["background, tick, repeat", "applause, audience, yells"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a clock is ticking and people are talking", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["crowd applause while a guy laughs followed by another man speaking", "winds blows roughly as a vehicle races past"], "sample_ids": ["tDlfY3nmx1A", "xjvTpk2Zpr8"], "start_seconds": ["160", "70"], "properties": ["applause, laugh, man", "wind, blows, vehicle"], "captions_pred_video": ["a man in a suit and tie is talking to another man in a suit and tie", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["a crowd is clapping and laughing and a man is speaking ", "a jet engine roars and wind blows "], "question": "which entity is a vehicle racing past?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a stream of water flows as people talk and wind blows"], "sample_ids": ["uRExseg-0XI", "xBxDz0CFVn0"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "stream, water, flow"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "a man is speaking with wind noise in the background "], "question": 
"which entity has more water", "label": 1}, {"captions": ["a heavy rain falls endlessly", "an airplane engine runs"], "sample_ids": ["wP8ZKrlx3oA", "yVPZ2MNWpms"], "start_seconds": ["40", "0"], "properties": ["heavy, rain, fall", "engine, airplane, runs"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a car is driving by on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a sleeping person emits a gravely snore", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["w2JXXIAdUdg", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["emits, sleeping, person", "applause, audience, yells"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["someone is snoring while sleeping", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["ujMt0-D-x2k", "zFjIWfSD-4"], "start_seconds": ["0", "410"], "properties": ["snore, sleep, someone", "People, motor, brakes"], "captions_pred_video": ["of the dog playing with a toy on the floor", null], "captions_pred_audio": ["a person is snoring loudly", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a person", "label": 1}, {"captions": ["a crowd yells, reacts and applauds", "a car accelerates and wind blows"], "sample_ids": ["wztCSUxOf8", "u0TrcHhkPQ"], "start_seconds": ["130", "20"], "properties": ["a crowd, yells, applauds", "accelerates, wind, blows"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is 
speaking and a crowd is clapping", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["wind blowing followed by a zoom", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vr8ZXjEBhMQ", "tDVADusiIoc"], "start_seconds": ["150", "60"], "properties": ["wind, blow, zoom", "water, radio, man"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about a man speaking over a radio as wind blows and water splashes?", "label": 1}, {"captions": ["several insects fly while two men talk", "a door opens and closes"], "sample_ids": ["s-T9OVOiMLo", "vBHyYJ8pL0"], "start_seconds": ["330", "2"], "properties": ["several, fly, men", "open, close, door"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", null], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "mechanisms are ticking and a sliding door is opening and closing "], "question": "which entity is more passive", "label": 1}, {"captions": ["a steam engine runs and whistles as it passes by", "people cheer as a vehicle engine revs"], "sample_ids": ["se87d6yxEOA", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["run, whistle, pass", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a train passing by a train station with smoke billowing out of the train's smokestack", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a train is moving and blowing its whistle ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a toilet flushes and water drains", "people 
cheer as a vehicle engine revs"], "sample_ids": ["sfAvvZwdLCY", "xjhAnI2q6hM"], "start_seconds": ["20", "6"], "properties": ["water drains, flushes, water", "engine revs, vehicle, people"], "captions_pred_video": ["footage of the toilet in the bathroom", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a toilet is flushed", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["dishes cling together then a man begins to speak", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sQGXqGcwOTc", "su6FAOcOA8c"], "start_seconds": ["3", "4"], "properties": ["cling, speak, dishes", "engine, idle, woman"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a woman is speaking and a subway train is moving "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["an emergency siren wails as it passes", "paper is crumpling consistently"], "sample_ids": ["vGj1XLJvNrw", "v5cSxLaHADY"], "start_seconds": ["0", "0"], "properties": ["wails, wails, pass", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of a police car driving down a city street", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["an emergency vehicle with a siren is passing by", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a person snoring several times", "a horn rings out as a machine runs by"], "sample_ids": ["spJCm8tD9Zo", "slZLHwNbbt4"], "start_seconds": ["90", "300"], "properties": ["snore, person, several", "a, horn, run"], "captions_pred_video": ["of a man laying on the ground with his mouth open", "footage of a train coming down the tracks on a sunny 
day"], "captions_pred_audio": ["a person is snoring loudly", "a train is moving and blowing its horn with a clickety-clack sound "], "question": "which entity is not a person?", "label": 1}, {"captions": ["an engine runs and wind blows", "an aircraft engine runs"], "sample_ids": ["vs65y4qmyBE", "yLCORCnd35Q"], "start_seconds": ["340", "0"], "properties": ["engine, run, wind", "engine, aircraft, runs"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a lufthansa airbus a380 landing at london's heathrow airport"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a train is moving and its wheels are squealing "], "question": "which entity is running", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a child speaks in closed space"], "sample_ids": ["w34HjHr6gAY", "yW6FWLSLkx4"], "start_seconds": ["30", "40"], "properties": ["beeps, hit, woman", "child, space, speak"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is a child speaking in a closed space?", "label": 1}, {"captions": ["a man speaks as a machine runs", "a man talks as several small engines run"], "sample_ids": ["vD6lYD1l0BY", "u9A6VZQCZpU"], "start_seconds": ["330", "30"], "properties": ["a, machine, 
run", "a, man, talk"], "captions_pred_video": ["game controller being held in the hands of the person", null], "captions_pred_audio": ["a man is speaking and dishes are being washed ", "a man is speaking while a race car is revving and accelerating "], "question": "which entity has a man speaking as a machine runs?", "label": 0}, {"captions": ["a woman speaks and food sizzles while frying", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wTideSjRFS0", "vfYTJq7nU"], "start_seconds": ["30", "130"], "properties": ["food, sizzle, woman", "rustling, ducks, quack"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", null], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a duck quacks and a woman speaks"], "question": "which entity is about a woman speaking and food sizzling while frying?", "label": 0}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "birds chirp and objects are moved around"], "sample_ids": ["wy1eKjR7KC0", "yPUYU6t3rwo"], "start_seconds": ["30", "370"], "properties": ["people, talk, distance", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a man is speaking and a siren is going off", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["soft movement is accompanied by clocks ticking in the background", "a man speaks over intermittent keyboard taps"], "sample_ids": ["vlJS7LN2XyM", "tw76HGONaKg"], "start_seconds": ["30", "570"], "properties": ["background, clocks, ticking", "audio, man, keyboard"], "captions_pred_video": ["of the clock in the video is blurry and hard to make out the numbers on the face of the 
clock are hard to see due to the blurriness of the video", "game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda"], "captions_pred_audio": ["a ticktock of a clock", "a man speaks and types on a computer keyboard "], "question": "which entity has a man speaking over intermittent keyboard taps?", "label": 1}, {"captions": ["a man speaks then multiple motorcycles pass by", "vehicles pass by on a roadway"], "sample_ids": ["zcDwZ6W7E3E", "tgbONvsP47Y"], "start_seconds": ["180", "0"], "properties": ["a, man, speak", "pass, vehicle, roadway"], "captions_pred_video": ["2 people riding motorcycles down a mountain road with trees lining the sides of the road", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking while a car accelerates and revs its engine ", "a car is driving on the road "], "question": "which entity has more vehicles", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "women speak and laugh as wind blows"], "sample_ids": ["y1saVTXsKwc", "un9VQlzgZM"], "start_seconds": ["80", "5"], "properties": ["a, dog, talk", "wind, speak, laugh"], "captions_pred_video": ["a dog playing with a pink ball", null], "captions_pred_audio": ["a dog barks and a man speaks", 
"a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is about a woman talking to a dog?", "label": 0}, {"captions": ["continuous sizzling with a woman speaking towards the end", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["ukxt9I7eMMg", "sLUnaPT5gM8"], "start_seconds": ["30", "0"], "properties": ["continuous, woman, speaking", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a person preparing food on a stool in a kitchen", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a woman is speaking while food is frying in the background ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more like a scream", "label": 1}, {"captions": ["wind blows as people chatter quietly", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["xBxDz0CFVn0", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["wind, chatter, people", "motor noise, horn, siren"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is louder", "label": 1}, {"captions": ["a person is snoring while sleeping", "some tunes played by whistling"], "sample_ids": ["vJrjSeP17yE", "u6BnG6YZqJ4"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "tune, play, whistling"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a person snoring loudly", "a person whistling a song"], "question": "which entity is playing tunes", "label": 1}, {"captions": ["a man is filing a hard object", "people cheer as a vehicle 
engine revs"], "sample_ids": ["vveS8HT7Uog", "xjhAnI2q6hM"], "start_seconds": ["100", "6"], "properties": ["a man, hard, object", "engine revs, vehicle, people"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a truck is revving its engine and a man is speaking "], "question": "which object is harder to file", "label": 0}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["zofjfKhqLk8", "vfYTJq7nU"], "start_seconds": ["10", "130"], "properties": ["background, metal, clank", "rustling, ducks, quack"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", null], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["dog barking and vehicle engine idling followed shortly by vehicle engine revving", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zY3icUyMdh8", "su6FAOcOA8c"], "start_seconds": ["20", "4"], "properties": ["dog, bark, engine", "engine, idle, woman"], "captions_pred_video": ["footage of a bus driving through a residential street at night", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a car is driving and dogs are barking and squealing ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a race car approaches quickly and slows down squealing tires", "some tunes played by whistling"], "sample_ids": ["sEprKHm8Sj8", 
"u6BnG6YZqJ4"], "start_seconds": ["90", "0"], "properties": ["car, tires, slows", "tune, play, whistling"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a person whistling a song"], "question": "which entity is not a car?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sQwlkXjQabo", "vJ7JPEFhyLA"], "start_seconds": ["10", "16"], "properties": ["water, spray, surface", "three men, wind, flow"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a video of a liquid flowing?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "water flows and trickles"], "sample_ids": ["zl9Dqx-j7q4", "tB7hWb9gTuQ"], "start_seconds": ["6", "30"], "properties": ["motors rev, laugh, loudly", "water, flow, trickle"], "captions_pred_video": ["footage of a man driving a car in the dark", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a jet engine roars ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["two men and a woman talk while wind blows and birds tweet", "food is frying then a woman speaks"], "sample_ids": ["wqZ135Ssz0", "ukxt9I7eMMg"], "start_seconds": ["60", "30"], "properties": ["two men, woman, birds", "food, woman, speak"], 
"captions_pred_video": [null, "footage of a person preparing food on a stool in a kitchen"], "captions_pred_audio": ["a man is speaking and ducks are quacking with wind noise in the background ", "a woman is speaking while food is frying in the background "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["wRV8yMk886E", "tdWhHV3X25Q"], "start_seconds": ["0", "60"], "properties": ["liquid, spray, nozzle", "applause, audience, yells"], "captions_pred_video": ["two cars are parked in a parking lot at night", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["an infant crying frantically", "water flows and trickles"], "sample_ids": ["zwOBqeFTgiU", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["cry, infant, frantically", "water, flow, trickle"], "captions_pred_video": ["of the baby crying in the car seat", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a baby cries loudly", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an engine works in idle nearby followed by a man talking", "vehicles pass by on a roadway"], "sample_ids": ["wqADXCzngMw", "tgbONvsP47Y"], "start_seconds": ["340", "0"], "properties": ["engine, idle, man", "pass, vehicle, roadway"], "captions_pred_video": ["of a man working on a vintage volkswagen beetle", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a lawn mower is running and a man is speaking ", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male 
voice", "people applaud and hoot and chat quietly"], "sample_ids": ["vK93VuO0yNc", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["male voice, bus, rumble", "people, applaud, hoot"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", null], "captions_pred_audio": ["a car drives by with wind noise in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["an adult speaks and is typing on a computer keyboard", "a clock ticktocks"], "sample_ids": ["x9JovgqUcs", "v-g-j2uTByM"], "start_seconds": ["500", "30"], "properties": ["An adult is speaking, typing, and using a computer keyboard", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "dishes cling together then a man begins to speak"], "sample_ids": ["sG7TyPnFDR0", "sQGXqGcwOTc"], "start_seconds": ["180", "3"], "properties": ["beeps, machine, smoke alarm", "cling, speak, dishes"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "mechanisms are operating and water is splashing "], "question": "which entity has a man speaking while a machine runs?", "label": 0}, {"captions": ["water drips and bubbles as a man speaks", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["vSeGhaZt-aI", "zl9Dqx-j7q4"], "start_seconds": ["50", "6"], "properties": ["water, bubbles, speak", "engine, laugh, loud"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "footage of a 
man driving a car in the dark"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a horn blows as a train chugs along and warning bells ring", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["ukg5L09Wpvo", "tiDFTC-5vU"], "start_seconds": ["150", "30"], "properties": ["a train, a horn, a bell", "male, duck, laugh"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", null], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["a motor runs and stops, and animals squawk and croak", "someone is typing on a computer keyboard"], "sample_ids": ["s4tUs779vBA", "v0x1odnXtP0"], "start_seconds": ["160", "210"], "properties": ["a, sound, stop", "keyboard, type, computer"], "captions_pred_video": ["game in 10 words or less - screenshot 1", "how to make money on youtube in spanish"], "captions_pred_audio": ["a car is revving and a man is speaking ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["an aircraft engine runs", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["yLCORCnd35Q", "uYT5gxnyMWM"], "start_seconds": ["0", "50"], "properties": ["engine, aircraft, runs", "female, spraying, scream"], "captions_pred_video": ["a lufthansa airbus a380 landing at london's heathrow airport", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a train is moving and its wheels are squealing ", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "wind blows as people chatter quietly"], "sample_ids": ["w0xsN8X18Y", "xBxDz0CFVn0"], 
"start_seconds": ["30", "30"], "properties": ["music, surface, rain", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "male speech with light ticking"], "sample_ids": ["s3cTDAj31g", "xO-Q2BlIIPU"], "start_seconds": ["80", "30"], "properties": ["man, talk, woman", "male, speech, ticking"], "captions_pred_video": [null, "a clock with a green glowing display showing the time 09 07 2016 12 31 2016"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a speech?", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "a sheep baa followed by birds chirping and then more sheep baaing"], "sample_ids": ["x5cuQjOdM3E", "vlS6YMeWAPo"], "start_seconds": ["30", "40"], "properties": ["cat, talk, meow", "sheep, baa, birds"], "captions_pred_video": ["a black background with an airplane flying in the sky", "footage of a goat in a pen behind a wooden fence"], "captions_pred_audio": ["a cat meows and a woman speaks", "a goat bleats and birds chirp"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a vehicle engine accelerates and wind blows", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["wudZTNBtVqc", "vfYTJq7nU"], "start_seconds": ["60", "130"], "properties": ["accelerates, engine, wind", "rustling, ducks, quack"], "captions_pred_video": ["footage is of a parking lot with cars parked in it", null], "captions_pred_audio": ["a car accelerates and revs its engine ", "a duck quacks and a woman speaks"], 
"question": "which entity is a video of a vehicle?", "label": 0}, {"captions": ["a small voice speaks, music plays followed by a double whoosh, and then a bell dings", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["tQWGZLItBXk", "wDVMhEdTiVw"], "start_seconds": ["170", "30"], "properties": ["voice, music, whoosh", "gun, shoot, water"], "captions_pred_video": ["worms revolution screenshots", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a movie", "label": 1}, {"captions": ["a beep repeats multiple times", "someone is typing on a computer keyboard"], "sample_ids": ["y682ml90jGw", "v0x1odnXtP0"], "start_seconds": ["11", "210"], "properties": ["beep, repeat, multiple", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a beeping sound is being made ", "a person is typing on a keyboard"], "question": "which is not a type of computer", "label": 0}, {"captions": ["a low rumbling in the distance followed by a motorcycle engine revving up", "a propeller rotates loudly and intensely"], "sample_ids": ["vr8ZXjEBhMQ", "ugHJF0hfYkg"], "start_seconds": ["150", "10"], "properties": ["sound, distance, engine", "loud, intense, propeller"], "captions_pred_video": ["is taken from a motorcycle's point of view", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a clock ticktocks"], "sample_ids": ["sZPuqDgX2V0", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["commentator, race, track", "ticktocks, clock, 
ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a dog barks and whimpers", "birds chirp and objects are moved around"], "sample_ids": ["sShpyu2l4YQ", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["barks, whimpers, dog", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["the puppies are playing with a toy", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a dog is barking and growling", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby cries and a woman moans", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["smDKStoHBJo", "wqZ135Ssz0"], "start_seconds": ["0", "60"], "properties": ["a, cry, woman", "two men, woman, birds"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more people", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "people cheer as a vehicle engine revs"], "sample_ids": ["zj2R0XoFr5k", "xjhAnI2q6hM"], "start_seconds": ["50", "6"], "properties": ["airplane, fly, woman", "engine revs, vehicle, people"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a truck is revving its engine and a man is speaking "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["music plays and repeated slaps accompany 
human sniveling, then insect buzz", "a young woman speaks and laughs and an animal snorts"], "sample_ids": ["weDbePuc-Xc", "uEU-Hg5MTN8"], "start_seconds": ["40", "27"], "properties": ["music, slaps, human", "a woman, laughs, animal"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has a woman speaking and laughing and an animal snorts?", "label": 1}, {"captions": ["a man makes an exclamation, then another man speaks", "a bird chirps followed by a door bell ringing that causes a woman to gasp and the music plays"], "sample_ids": ["xO-Q2BlIIPU", "sU53zg9Jp7s"], "start_seconds": ["30", "380"], "properties": ["two men, exclamation, speak", "a bird chirps, a door bell ringing, a woman gasps"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", "a cartoon girl is standing in front of a blue couch"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "birds chirp and a doorbell rings with breathing and music in the background "], "question": "which entity has a doorbell ringing?", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "a machine beeps continuously"], "sample_ids": ["uJV8NDaHqqk", "y682ml90jGw"], "start_seconds": ["100", "11"], "properties": ["loud, fly, chirp", "beeps, machine, continuously"], "captions_pred_video": ["a bee hive in a wooden box", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "wind blowing followed by a zoom"], "sample_ids": ["t25U-v4k4ts", "vr8ZXjEBhMQ"], "start_seconds": ["40", "150"], 
"properties": ["bees buzz, birds chirp, man speaks", "wind, blow, zoom"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "is taken from a motorcycle's point of view"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "wind blows and a chainsaw cuts through wood "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks and food sizzles while frying", "small dogs yip and bark sharply"], "sample_ids": ["wTideSjRFS0", "v-wcQf4BDY0"], "start_seconds": ["30", "120"], "properties": ["food, sizzle, woman", "bark, yip, sharply"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a dog barks and growls"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks in the background while a slow tick repeats", "some tunes played by whistling"], "sample_ids": ["vZAw4apG0Es", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["background, tick, repeat", "tune, play, whistling"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a clock is ticking and people are talking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a man speaks as insects buzz and a bird chirps", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["t25U-v4k4ts", "y8WEcpOlT3I"], "start_seconds": ["40", "40"], "properties": ["a, chirps, bird", "harsh, wind, blows"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a man is speaking with wind noise in 
the background "], "question": "which entity is more active", "label": 1}, {"captions": ["music plays and someone speaks before gunfire and an explosion occurs", "a horn blasts as warning bells ring"], "sample_ids": ["xKB8O8LTs6s", "zgUgkpk78xU"], "start_seconds": ["70", "70"], "properties": ["music, gunfire, explosion", "horn, bells, ring"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a warning", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["yYEVLuqEytU", "xfaoyyzw2WU"], "start_seconds": ["40", "180"], "properties": ["grunt, slurp, background", "loud, jet engine, roar"], "captions_pred_video": ["a baby goat is being petted by a person's hand", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["several sheep bleat and a man speaks", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "someone is typing on a computer keyboard"], "sample_ids": ["w0xsN8X18Y", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["music, surface, rain", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a person is typing on a keyboard"], 
"question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a man sprays as a scraping occurs in the background", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["sOa7g-44Dag", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["background, man, spray", "People, motor, brakes"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", null], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has a motor running?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a man speaks as a car is passing by"], "sample_ids": ["zgUgkpk78xU", "sK4u5T8hW78"], "start_seconds": ["70", "30"], "properties": ["horn, bells, ring", "a, car, pass"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a warning", "label": 0}, {"captions": ["a door slams shut roughly", "dishes cling together then a man begins to speak"], "sample_ids": ["zkKdxzNC97Y", "sQGXqGcwOTc"], 
"start_seconds": ["27", "3"], "properties": ["a door, slams, shut", "cling, speak, dishes"], "captions_pred_video": ["footage of the door opening and closing in slow motion", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a door is opened and closed", "mechanisms are operating and water is splashing "], "question": "which entity is more likely to be a door", "label": 0}, {"captions": ["water running down a sink while a man is talking", "a woman speaks and other women and a man talk with her"], "sample_ids": ["vSeGhaZt-aI", "vbpKkWvfOu4"], "start_seconds": ["50", "560"], "properties": ["water, sink, talk", "a, woman, man"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a woman is speaking and a man is speaking"], "question": "which entity has a man talking to a sink?", "label": 0}, {"captions": ["a woman speaks and food sizzles while frying", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["wTideSjRFS0", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["food, sizzle, woman", "animal, grunts, snorts"], "captions_pred_video": ["footage of a woman cooking in a kitchen with a microwave oven", "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a woman is speaking while water is running in the background", "a woman is speaking and a baby is crying"], "question": "which entity is about a woman speaking and food sizzling while frying?", "label": 0}, {"captions": ["a woman speaks as she rubs two objects together", "someone whistles a tune"], "sample_ids": ["vzxHnu-SFEw", "sIXTftIuUgw"], "start_seconds": ["80", 
"90"], "properties": ["two objects, woman, speak", "someone, tune, whistle"], "captions_pred_video": ["how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to", null], "captions_pred_audio": ["a woman is speaking and breathing with mechanisms in the background ", "a person whistling a song"], "question": "which is not a musical instrument", "label": 0}, {"captions": ["plastic is tapped on while someone speaks", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wvKpEYswXO0", "tDVADusiIoc"], "start_seconds": ["150", "60"], "properties": ["plastic, tap, speak", "water, radio, man"], "captions_pred_video": ["of the person preparing food in the kitchen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a toilet flushes and water drains"], "sample_ids": ["sG7TyPnFDR0", "sfAvvZwdLCY"], "start_seconds": 
["180", "20"], "properties": ["beeps, machine, smoke alarm", "water drains, flushes, water"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", "footage of the toilet in the bathroom"], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a toilet is flushed"], "question": "which entity is a source of water", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "an airplane engine runs"], "sample_ids": ["zj2R0XoFr5k", "yVPZ2MNWpms"], "start_seconds": ["50", "0"], "properties": ["airplane, boy, fly", "engine, airplane, runs"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a car is driving by on the road "], "question": "which airplane is flying", "label": 0}, {"captions": ["here comes the train and it starts to blow the horn and get close", "a dog barks and whimpers"], "sample_ids": ["s7knHCFW82w", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["blow horn, get close, train", "barks, whimpers, dog"], "captions_pred_video": ["footage of the train on the tracks near a building and a car parked on the side of the road", "the puppies are playing with a toy"], "captions_pred_audio": ["a train is blowing its horn and its wheels are squealing ", "a dog is barking and growling"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a person snores hilariously while someone laughs", "water splashes as an animal walks through"], "sample_ids": ["sSMl2vc3ek", "w1ir-sZ3Im8"], "start_seconds": ["20", "90"], "properties": ["a person, laughs, snores", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a person snoring loudly", "water splashes and 
gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "water flows as men speak and yell"], "sample_ids": ["w0xsN8X18Y", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["music, surface, rain", "water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["speaking following by laughing and clapping", "after a few seconds of silence, a loud bang occurs followed by a softer banging noise"], "sample_ids": ["u2f5NpsoHBg", "zkKdxzNC97Y"], "start_seconds": ["30", "27"], "properties": ["person, laugh, clap", "loud, bang, noise"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a door is opened and closed"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a weapon fires multiple times", "pigeons vocalize and birds chirp"], "sample_ids": ["sMC07Ucy7kg", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["weapon, fire, multiple", "vocalize, bird, chirp"], "captions_pred_video": ["footage is from a car's point of view", "of the pigeon in the cage"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking and a bee is buzzing"], "question": "which entity is not a weapon", "label": 1}, {"captions": ["a motorcycle engine revs then accelerates before hitting a bump", "paper is crumpling consistently"], "sample_ids": ["w-4gHptFNuU", "v5cSxLaHADY"], "start_seconds": ["21", "0"], "properties": ["engine revs, accelerates, bump", "paper is 
crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["is taken from a motorcycle rider's point of view", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a car accelerates and revs its engine ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "a woman speaks happily and an animal chirps"], "sample_ids": ["uYT5gxnyMWM", "uWAAAL4CIoc"], "start_seconds": ["50", "0"], "properties": ["a, scream, girl", "a woman, chirps, animal"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a woman is speaking and a dog is barking "], "question": "which entity is more calm", "label": 1}, {"captions": ["a woman talking as an infant is crying", "a car accelerates and wind blows"], "sample_ids": ["tMbMDvT50j8", "u0TrcHhkPQ"], "start_seconds": ["12", "20"], "properties": ["a, talk, infant", "accelerates, wind, blows"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a race car accelerates and revs its engine "], "question": "which is not a person", "label": 1}, {"captions": ["some people speak", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["vbZ-0lGPneg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["some people speak English, some people speak Spanish, some people speak French", "female, spraying, scream"], "captions_pred_video": ["of a man holding a baby duck in his hands", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a woman is speaking and a baby is crying"], "question": "which entity is more violent", "label": 1}, {"captions": ["a weapon fires multiple times", 
"an insect buzzes around continuously"], "sample_ids": ["sMC07Ucy7kg", "v25l1jef3JY"], "start_seconds": ["10", "0"], "properties": ["weapon, fire, multiple", "buzzes, continuously, insect"], "captions_pred_video": ["footage is from a car's point of view", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is not a weapon", "label": 1}, {"captions": ["white noise and snoring with some rustling in the background", "children speak and play together"], "sample_ids": ["xzKKf9bKNUo", "yVVP8XvWJTo"], "start_seconds": ["10", "260"], "properties": ["background, noise, snoring", "children, speak, play"], "captions_pred_video": ["shows a woman laying on a bed with her eyes closed and her mouth open", "footage of a playground at a school or daycare center"], "captions_pred_audio": ["a person snoring loudly", "children are speaking and breathing with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "a person uses a saw to cut some wood"], "sample_ids": ["soTOh3zYJfY", "sHbXC6na9hg"], "start_seconds": ["40", "0"], "properties": ["vehicle, skid, tires", "a person, saw, wood"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", "a man using a tractor to cut a log into firewood youtube"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "an engine is idling and vibrating"], "question": "which entity is stationary", "label": 1}, {"captions": ["a cat meows and children speak", "birds chirp and wind blows"], "sample_ids": ["x5cuQjOdM3E", "sxIvBMSavMQ"], "start_seconds": ["30", "210"], "properties": ["cat, speak, children", "birds, chirp, wind"], "captions_pred_video": ["a black background with an airplane flying in the sky", "beekeeping 101 how to extract honey from a 
beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a"], "captions_pred_audio": ["a cat meows and a woman speaks", "birds are chirping and insects are buzzing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["an engine runs loudly", "paper is crumpling consistently"], "sample_ids": ["vqZuVbG6-HI", "v5cSxLaHADY"], "start_seconds": ["130", "0"], "properties": ["loud, engine, run", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage is blurry because it's raining outside", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a lawn mower is running and men are speaking ", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a motor idles, accelerates, then slows down.", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["vYkA3cfXp5Q", "y8WEcpOlT3I"], "start_seconds": ["30", "40"], "properties": ["speed, idle, accelerate", "harsh, wind, blows"], "captions_pred_video": ["footage of a car driving down the street on a sunny day", "on how to use a sewing machine youtube"], "captions_pred_audio": ["an engine is idling", "a man is speaking with wind noise in the background "], "question": "which entity is a natural phenomenon", "label": 1}, {"captions": ["water splashes and a motorboat passes as people yell", "birds 
vocalize and chirp continuously"], "sample_ids": ["w5W5Kqtc8E", "w1mlz3Pe4fU"], "start_seconds": ["100", "300"], "properties": ["water, splashes, motorboat", "vocalize, chirp, continuously"], "captions_pred_video": [null, "of a bird in a cage"], "captions_pred_audio": ["a motorboat is moving and people are shouting and cheering ", "birds are chirping and singing"], "question": "which entity is quieter", "label": 1}, {"captions": ["three men talk while wind blows and some liquid flows", "dishes cling together then a man begins to speak"], "sample_ids": ["vJ7JPEFhyLA", "sQGXqGcwOTc"], "start_seconds": ["16", "3"], "properties": ["three men, wind, flow", "cling, speak, dishes"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 1}, {"captions": ["an insect buzzes around continuously", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["v25l1jef3JY", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["buzzes, continuously, insect", "engine, idle, woman"], "captions_pred_video": ["a black background with a cartoon character in the foreground", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and a subway train is moving "], "question": "which entity is not a person?", "label": 0}, {"captions": ["birds chirp as a man speaks and a younger person speaks", "some men converse over an engine running"], "sample_ids": ["xl2PIWyXaM", "sCiy7QS1U"], "start_seconds": ["160", "300"], "properties": ["chirp, man, younger person", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and people are talking", "a man 
speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking to a younger person?", "label": 0}, {"captions": ["ticking continues without interruption", "a duck quacks continuously"], "sample_ids": ["v-g-j2uTByM", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["ticking, continuous, clock", "quacks, continuously, duck"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a clock is ticking loudly", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "a car accelerates and wind blows"], "sample_ids": ["yPUYU6t3rwo", "u0TrcHhkPQ"], "start_seconds": ["370", "20"], "properties": ["birds chirp, objects are moved around, birds", "accelerates, wind, blows"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", null], "captions_pred_audio": ["insects buzz and a man speaks", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["a man speaks with another voice speaking in the background", "water splashes as an animal walks through"], "sample_ids": ["u21-Z5gJCB8", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["background, voice, man", "animal, water, splashes"], "captions_pred_video": ["- a person cooking eggs in a pan on the stove", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["popping and crackling repeats as men yell and laugh", "water flows as men speak and yell"], "sample_ids": ["rqu8iB22IY", "vJ7JPEFhyLA"], "start_seconds": ["5", "16"], "properties": ["sound, repeats, laugh", 
"water, flow, men"], "captions_pred_video": [null, "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a dog barks and a man speaks while music plays ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about water flowing as men speak and yell?", "label": 1}, {"captions": ["a male is speaking and a duck quacks as others laugh", "an infant crying frantically"], "sample_ids": ["tiDFTC-5vU", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["male, duck, laugh", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and ducks are quacking", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a clock ticktocks"], "sample_ids": ["w34HjHr6gAY", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["beeps, hit, woman", "ticktocks, clock, ticktocks"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a young woman speaks over spraying and another person yells", "someone whistles a tune"], "sample_ids": ["uYT5gxnyMWM", "sIXTftIuUgw"], "start_seconds": ["50", "90"], "properties": ["person, spray, yell", 
"someone, tune, whistle"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", null], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["several ducks are quacking and squawking", "wind blows as people chatter quietly"], "sample_ids": ["wfHeoPDLMaM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["quacking, squawking, ducks", "wind, chatter, people"], "captions_pred_video": ["ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire in the middle of the barn ducks in a barn with a fire", "footage is blurry and out of focus"], "captions_pred_audio": ["ducks are quacking", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a crowd yells, reacts and applauds"], "sample_ids": ["shmR4OZtzqA", "wztCSUxOf8"], "start_seconds": ["30", "130"], "properties": ["man, engine, idle", "a crowd, yells, applauds"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to 
replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", null], "captions_pred_audio": ["a man speaks while a motor runs", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a baby cries and a woman moans", "a person screams glaringly"], "sample_ids": ["smDKStoHBJo", "xC8kbrKJmco"], "start_seconds": ["0", "0"], "properties": ["a, cry, woman", "glaringly, screams, person"], "captions_pred_video": ["a man holding a crying baby in his arms", null], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a goat is bleating "], "question": "which entity is a person?", "label": 1}, {"captions": ["a man speaks over a running engine and blowing wind", "a machine beeps continuously"], "sample_ids": ["ylpYOorfH4o", "y682ml90jGw"], "start_seconds": ["410", "11"], "properties": ["engine, running, wind", "beeps, machine, continuously"], "captions_pred_video": ["for youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 youtube how to replace the fuel pump on a 1999 dodge ram 1500 
youtube how to replace the fuel pump on a 1999 dodge ram 15", null], "captions_pred_audio": ["a man is speaking and an engine is revving", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a vehicle engine idles", "a clock ticktocks"], "sample_ids": ["shmR4OZtzqA", "v-g-j2uTByM"], "start_seconds": ["30", "30"], "properties": ["man, engine, idle", "ticktocks, clock, ticktocks"], "captions_pred_video": ["shows how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche youtube how to replace a fog light bulb on a 2005 chevrolet avalanche you", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a man speaks while a motor runs", "a clock is ticking loudly"], "question": "which entity is ticktocks", "label": 1}, {"captions": ["a man talks while vehicles pass by", "an airplane engine spools and people speak"], "sample_ids": ["sK4u5T8hW78", "wTjoRj1se3U"], "start_seconds": ["30", "390"], "properties": ["a, man, talk", "airplane, engine, spool"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 
hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a jet engine is running and people are talking"], "question": "which entity is a video of a man talking?", "label": 0}, {"captions": ["a man speaks while a machine runs before a smoke alarm beeps", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["sG7TyPnFDR0", "w5W5Kqtc8E"], "start_seconds": ["180", "100"], "properties": ["beeps, machine, smoke alarm", "wind, blow, vehicle"], "captions_pred_video": ["a person is using an espresso machine in a restaurant", null], "captions_pred_audio": ["a man is speaking and a microwave oven is beeping ", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["a power tool runs and touches a surface", "male speech with light ticking"], "sample_ids": ["zfvPRf3chY", "xO-Q2BlIIPU"], "start_seconds": ["290", "30"], "properties": ["power tool, run, touch", "male, speech, ticking"], "captions_pred_video": [null, "a clock with a green glowing display showing the time 09 07 2016 12 31 2016"], "captions_pred_audio": ["a man is speaking while a power tool is being used ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is not a power tool", "label": 1}, {"captions": ["a toilet flushes and water drains", "an engine works in idle nearby followed by a man talking"], "sample_ids": ["sfAvvZwdLCY", "wqADXCzngMw"], "start_seconds": ["20", "340"], "properties": ["water drains, flushes, water", "engine, idle, man"], "captions_pred_video": ["footage of the toilet in the bathroom", "of a man working on a vintage volkswagen beetle"], "captions_pred_audio": ["a toilet is flushed", "a lawn mower is running and a man is speaking "], "question": "which entity is a machine?", 
"label": 0}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "someone whistles a tune"], "sample_ids": ["w2JXXIAdUdg", "sIXTftIuUgw"], "start_seconds": ["10", "90"], "properties": ["snoring, distance, person", "someone, tune, whistle"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", null], "captions_pred_audio": ["a person snoring and a dog whimpering", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["some liquid flows while a woman laughs and man talks", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["vddP56-ogds", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["liquid, laughs, man", "male, duck, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a man is speaking and ducks are quacking"], "question": "which entity has a duck?", "label": 1}, {"captions": ["a man speaks as a scratching occurs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sOa7g-44Dag", "wDVMhEdTiVw"], "start_seconds": ["30", "30"], "properties": ["audio, scratching, man", "gun, shoot, water"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["long loud burping by a man", "wind blows strongly"], "sample_ids": ["xmiUIOhtZyQ", "w8uLijTqtlU"], "start_seconds": ["60", "70"], "properties": ["loud, burp, man", "wind, blows, strongly"], "captions_pred_video": ["homer simpson drinking a beer", "footage is blurry and shaky"], "captions_pred_audio": ["a person burps and music plays in 
the background ", "the wind is blowing strongly"], "question": "which is louder", "label": 1}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a man speaks as a car is passing by"], "sample_ids": ["w34HjHr6gAY", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "a, car, pass"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a man speaking to a car?", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "a telephone rings followed by a woman talking"], "sample_ids": ["w34HjHr6gAY", "tGcFnX0GHI"], "start_seconds": ["30", "0"], "properties": ["beeps, hit, woman", "ring, talk, woman"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard 
of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", null], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a dial tone sounds followed by a woman speaking"], "question": "which entity has a woman talking", "label": 1}, {"captions": ["a woman talking as an infant is crying", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["tMbMDvT50j8", "vfYTJq7nU"], "start_seconds": ["12", "130"], "properties": ["a, talk, infant", "rustling, ducks, quack"], "captions_pred_video": ["shows a little girl covering her face with her hands while sitting at a table", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a duck quacks and a woman speaks"], "question": "which entity is about a woman talking to an infant?", "label": 0}, {"captions": ["loud, continuous burping", "a clock ticktocks"], "sample_ids": ["y636gklDioE", "v-g-j2uTByM"], "start_seconds": ["20", "30"], "properties": ["loud, continuous, burping", "ticktocks, clock, ticktocks"], "captions_pred_video": ["a dog sitting on a red chair in front of an old telephone", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a person burps loudly several times", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a helicopter engine runs continuously", "a woman speaks and other women and a man talk with her"], "sample_ids": ["ugHJF0hfYkg", "vbpKkWvfOu4"], "start_seconds": ["10", "560"], "properties": ["engine, running, continuously", "a, woman, man"], "captions_pred_video": ["a man in a helicopter cockpit wearing headphones", "2012-07-20 17 30 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20"], "captions_pred_audio": ["a helicopter is flying overhead ", "a woman is speaking and a man is speaking"], "question": "which entity has more people", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "some tunes played by whistling"], "sample_ids": ["vuUVPzd2FXw", "u6BnG6YZqJ4"], "start_seconds": ["160", "0"], "properties": ["a, steam, release", "tune, play, whistling"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a large bell chimes back and forth loudly", "some tunes played by whistling"], "sample_ids": ["w2M4i1mklOA", "u6BnG6YZqJ4"], "start_seconds": ["30", "0"], "properties": ["loud, chime, bell", "tune, play, whistling"], "captions_pred_video": ["footage of an antique clock", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a clock is ticking and a bell is ringing ", "a person whistling a song"], "question": "which entity is played by whistling", "label": 1}, {"captions": ["a person is burping while a girl speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["vdoxuJn9lTc", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["person, burp, girl", "People, motor, brakes"], "captions_pred_video": ["a group of young girls playing a video game together in a living room", null], "captions_pred_audio": ["a child speaks followed by a burp", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is about a person speaking to a girl?", 
"label": 0}, {"captions": ["rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown", "people cheer as a vehicle engine revs"], "sample_ids": ["vfYTJq7nU", "xjhAnI2q6hM"], "start_seconds": ["130", "6"], "properties": ["rustling, ducks, quack", "engine revs, vehicle, people"], "captions_pred_video": [null, "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a duck quacks and a woman speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a boat travels through the waves as the wind blows loudly and a man speaks over a radio", "paper is crumpling consistently"], "sample_ids": ["tDVADusiIoc", "v5cSxLaHADY"], "start_seconds": ["60", "0"], "properties": ["wind, radio, waves", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a man speaks as a car is passing by", "dishes cling together then a man begins to speak"], "sample_ids": ["sK4u5T8hW78", "sQGXqGcwOTc"], "start_seconds": ["30", "3"], "properties": ["a, car, pass", "cling, speak, dishes"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man washing dishes in a commercial kitchen"], 
"captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a man makes an exclamation, then another man speaks", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["xO-Q2BlIIPU", "zFjIWfSD-4"], "start_seconds": ["30", "410"], "properties": ["two men, exclamation, speak", "People, motor, brakes"], "captions_pred_video": ["a clock with a green glowing display showing the time 09 07 2016 12 31 2016", null], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity has more people", "label": 1}, {"captions": ["water flows as men speak and yell", "a telephone rings followed by a woman talking"], "sample_ids": ["vJ7JPEFhyLA", "tGcFnX0GHI"], "start_seconds": ["16", "0"], "properties": ["water, flow, men", "ring, talk, woman"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a dial tone sounds followed by a woman speaking"], "question": "which entity is a recording", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a clock ticktocks"], "sample_ids": ["uEU-Hg5MTN8", "v-g-j2uTByM"], "start_seconds": ["27", "30"], "properties": ["a woman, laughs, animal", "ticktocks, clock, ticktocks"], "captions_pred_video": ["of a girl wearing a pig mask and a boy hugging her", "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["an aircraft engine runs as people speak", "an airplane engine runs"], "sample_ids": ["wTjoRj1se3U", "yVPZ2MNWpms"], 
"start_seconds": ["390", "0"], "properties": ["engine, run, people", "engine, airplane, runs"], "captions_pred_video": ["footage of a man playing with a remote control airplane in a field", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a jet engine is running and people are talking", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "a horn honks twice and keys jingle, followed by an electronic beep"], "sample_ids": ["uWAAAL4CIoc", "wSVhSdj0F0"], "start_seconds": ["0", "10"], "properties": ["a woman, chirps, animal", "horn honks, keys jingle, electronic beep"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a car horn honks and keys jangle with background noise "], "question": "which entity is more likely to be heard in a car", "label": 1}, {"captions": ["an audience gives applause", "a horn blasts as warning bells ring"], "sample_ids": ["x6iCUDmRpKQ", "zgUgkpk78xU"], "start_seconds": ["38", "70"], "properties": ["applause, audience, give", "horn, bells, ring"], "captions_pred_video": ["a black background with the moon and stars in the sky", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a group of people are clapping and cheering", "a train blows its horn as it speeds down the tracks "], "question": "which is a warning", "label": 1}, {"captions": ["water drips and bubbles as a man speaks", "dishes cling together then a man begins to speak"], "sample_ids": ["vSeGhaZt-aI", "sQGXqGcwOTc"], "start_seconds": ["50", "3"], "properties": ["water, bubbles, speak", "cling, speak, 
dishes"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["water rushes and then a vehicle zooms past", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["s4Uz1Ffgo04", "xfaoyyzw2WU"], "start_seconds": ["100", "180"], "properties": ["water, rushes, vehicle", "loud, jet engine, roar"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["men speak and a nozzle sprays liquid", "a horn blasts as warning bells ring"], "sample_ids": ["wRV8yMk886E", "zgUgkpk78xU"], "start_seconds": ["0", "70"], "properties": ["liquid, spray, nozzle", "horn, bells, ring"], "captions_pred_video": ["two cars are parked in a parking lot at night", "of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108"], "captions_pred_audio": ["a man speaks followed by a loud burst", "a train blows its horn as it speeds down the tracks "], "question": "which entity is a warning device", "label": 1}, {"captions": ["a man speaks uses a drill", "a door slams shut and an object moves on a hard surface"], "sample_ids": ["x5eIC7S0fbg", "zkKdxzNC97Y"], "start_seconds": ["60", "27"], "properties": ["A man is speaking, uses a drill, and is a tool", "hard, 
surface, door"], "captions_pred_video": ["a person in surgical gloves is using a needle to remove a small object from a tooth", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a man is speaking and using a power tool ", "a door is opened and closed"], "question": "which entity is a tool", "label": 0}, {"captions": ["a horse runs while two women talk", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["sdvI1mHAsc", "uYT5gxnyMWM"], "start_seconds": ["20", "50"], "properties": ["two women, horse, run", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["horses clip-clop and a woman speaks", "a woman is speaking and a baby is crying"], "question": "which entity has more people", "label": 1}, {"captions": ["a motorcycle engine is revving while people are speaking", "small dogs yip and bark sharply"], "sample_ids": ["y8dSeubCNI", "v-wcQf4BDY0"], "start_seconds": ["4", "120"], "properties": ["engine revving, people speaking, motorcycle", "bark, yip, sharply"], "captions_pred_video": [null, "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["an engine revving and people talking in the background", "a dog barks and growls"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a woman speaks happily and an animal chirps", "an engine runs loudly"], "sample_ids": ["uWAAAL4CIoc", "vqZuVbG6-HI"], "start_seconds": ["0", "130"], "properties": ["a woman, chirps, animal", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a woman is speaking and a dog is barking ", "a lawn mower is running and men are speaking "], "question": "which entity is quieter", "label": 0}, {"captions": ["wind blows as people chatter quietly", "waves crash against a shoreline and people speak"], "sample_ids": ["xBxDz0CFVn0", 
"yFB25fqfU8I"], "start_seconds": ["30", "300"], "properties": ["wind, chatter, people", "wave, crash, shoreline"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person surfing in the ocean"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a man is speaking and laughing while water is splashing and gurgling"], "question": "which entity is more calm", "label": 0}, {"captions": ["a man speaks while rain falls onto a hard surface", "an insect buzzes around continuously"], "sample_ids": ["wqN6IIHw3po", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["rain, surface, fall", "buzzes, continuously, insect"], "captions_pred_video": ["in your own words what is happening in this screenshot? blood splattered all over the place", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and water is splashing", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a machine clanks and thumps and a male speaks", "a man talks nearby and another man talks far away while some liquid flows"], "sample_ids": ["sWZzXuWYY", "sapQIQUhFc"], "start_seconds": ["420", "280"], "properties": ["male, clanks, thumps", "liquid, flow, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a man is speaking and a stream is flowing in the background "], "question": "which entity has a man speaking?", "label": 0}, {"captions": ["speaking following by laughing and clapping", "an animal quacks rapidly"], "sample_ids": ["u2f5NpsoHBg", "vh30P49Po6s"], "start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "animal, quacks, rapidly"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "a 
duck is quacking loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["music plays and animals vocalize as a cartoon character makes sounds", "an infant crying frantically"], "sample_ids": ["weDbePuc-Xc", "zwOBqeFTgiU"], "start_seconds": ["40", "30"], "properties": ["cartoon character, music, vocalize", "cry, infant, frantically"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "of the baby crying in the car seat"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["a infant makes noise and is excited", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["wIJK3-5y0kA", "tiDFTC-5vU"], "start_seconds": ["30", "30"], "properties": ["noise, excited, infant", "male, duck, laugh"], "captions_pred_video": ["of a baby playing with a cat in a dark room", null], "captions_pred_audio": ["a baby cries and a woman speaks", "a man is speaking and ducks are quacking"], "question": "which entity is speaking", "label": 1}, {"captions": ["two women and a man talk while a kid cries", "people applaud and hoot and chat quietly"], "sample_ids": ["wyllXV6PjKo", "wwyfGO2J4"], "start_seconds": ["30", "90"], "properties": ["a kid, talk, cry", "people, applaud, hoot"], "captions_pred_video": [null, null], "captions_pred_audio": ["a woman speaks and a baby cries", "people are clapping and speaking with background noise "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a child babbles as a woman speaks", "a woman speaks as she rubs two objects together"], "sample_ids": ["wEBlkGWVWwE", "vzxHnu-SFEw"], "start_seconds": ["260", "80"], "properties": ["a, babble, woman", "two objects, woman, speak"], "captions_pred_video": ["shows a person writing on the whiteboard", "how to make a paper cup with scissors youtube how to make a paper 
cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a woman is speaking and a child is speaking with background noise and clapping ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which woman speaks as she rubs two objects together?", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["vSeGhaZt-aI", "tDVADusiIoc"], "start_seconds": ["50", "60"], "properties": ["water, bubbles, run", "water, radio, man"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity shows a man speaking over a radio?", "label": 1}, {"captions": ["a baby coos and fidgets as a lady speaks and laughs", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uPDn2BFTHk", "wDVMhEdTiVw"], "start_seconds": ["140", "30"], "properties": ["lady, laugh, baby", "gun, shoot, water"], "captions_pred_video": 
[null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a baby laughs and a woman speaks", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to cause harm", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["t69a8aRKhmc", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["a, b, c", "a, scream, girl"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "people speak as gunfire rings out"], "sample_ids": ["slZLHwNbbt4", "wqTCwqVRDlk"], "start_seconds": ["300", "80"], "properties": ["a, horn, run", "gunfire, ring, speak"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "a man is speaking and a gun is fired"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a dog whimpers and a woman briefly talks", "vehicles pass by on a roadway"], "sample_ids": ["y1saVTXsKwc", "tgbONvsP47Y"], "start_seconds": ["80", "0"], "properties": ["a, dog, talk", "pass, vehicle, roadway"], "captions_pred_video": ["a dog playing with a pink ball", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a dog barks and a man speaks", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "people applaud and hoot and chat quietly"], "sample_ids": 
["vSeGhaZt-aI", "wwyfGO2J4"], "start_seconds": ["50", "90"], "properties": ["water, bubbles, run", "people, applaud, hoot"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", null], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "people are clapping and speaking with background noise "], "question": "which entity has more people", "label": 1}, {"captions": ["a train whistle keeps going off while the clickety-clack of the train on the rails are continuous", "birds chirp and pigeons vocalize while walking around"], "sample_ids": ["ukg5L09Wpvo", "wIvYjuR3nrg"], "start_seconds": ["150", "9"], "properties": ["clickety-clack, train, whistle", "birds, pigeons, vocalize"], "captions_pred_video": ["footage of a train passing through a forest on a dirt road", "footage of a pigeon sitting on a roof with trees in the background"], "captions_pred_audio": ["a train blows its whistle and blows its horn ", "birds are chirping and cooing"], "question": "which entity is a bird?", "label": 1}, {"captions": ["ticking continues without interruption", "an infant crying as a woman laughs"], "sample_ids": ["v-g-j2uTByM", "xhmRY9yhC7c"], "start_seconds": ["30", "20"], "properties": ["ticking, continuous, clock", "a, laugh, infant"], "captions_pred_video": ["in your own words a cuckoo clock hanging on the wall", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a clock is ticking loudly", "a baby cries and a woman speaks"], "question": "which entity is not continuous", "label": 1}, {"captions": ["a man speaks and is typing on a keyboard", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["x9JovgqUcs", "uYT5gxnyMWM"], "start_seconds": ["500", "50"], "properties": ["a, man, speaks, keyboard", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man speaks and types on a keyboard", "a woman is speaking and 
a baby is crying"], "question": "which entity is a person speaking?", "label": 0}, {"captions": ["a man speaks over a radio as wind blows and water splashes", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["tDVADusiIoc", "w5W5Kqtc8E"], "start_seconds": ["60", "100"], "properties": ["water, radio, man", "wind, blow, vehicle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", null], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has a vehicle engine running?", "label": 1}, {"captions": ["someone is typing on a computer keyboard", "a helicopter engine runs continuously"], "sample_ids": ["v0x1odnXtP0", "ugHJF0hfYkg"], "start_seconds": ["210", "10"], "properties": ["keyboard, type, computer", "engine, running, continuously"], "captions_pred_video": ["how to make money on youtube in spanish", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a person is typing on a keyboard", "a helicopter is flying overhead "], "question": "which entity is running continuously", "label": 1}, {"captions": ["several ducks quack and cocks crow far away", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sNB8zxXneIM", "zj2R0XoFr5k"], "start_seconds": ["20", "50"], "properties": ["several, quack, cocks", "airplane, boy, fly"], "captions_pred_video": ["a group of geese in a cage", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["birds chirp and a pop occurs before a man speaks", "a propeller rotates loudly and intensely"], "sample_ids": ["zuua6-5goWw", "ugHJF0hfYkg"], "start_seconds": ["30", 
"10"], "properties": ["sound, pop, bird", "loud, intense, propeller"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a helicopter is flying overhead "], "question": "which entity is louder", "label": 1}, {"captions": ["motor noise is followed by a horn honking and a siren wailing", "a propeller rotates loudly and intensely"], "sample_ids": ["y2bVZ7rz-5M", "ugHJF0hfYkg"], "start_seconds": ["280", "10"], "properties": ["motor noise, horn, siren", "loud, intense, propeller"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a helicopter is flying overhead "], "question": "which is louder", "label": 1}, {"captions": ["birds tweet and squawk", "an infant crying as a woman laughs"], "sample_ids": ["w1mlz3Pe4fU", "xhmRY9yhC7c"], "start_seconds": ["300", "20"], "properties": ["squawk, tweet, scream", "a, laugh, infant"], "captions_pred_video": ["of a bird in a cage", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["birds are chirping and singing", "a baby cries and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a person snores loudly multiple 
times at a close distance", "someone is typing on a computer keyboard"], "sample_ids": ["sSMl2vc3ek", "v0x1odnXtP0"], "start_seconds": ["20", "210"], "properties": ["loud, multiple, distance", "keyboard, type, computer"], "captions_pred_video": [null, "how to make money on youtube in spanish"], "captions_pred_audio": ["a person snoring loudly", "a person is typing on a keyboard"], "question": "which is not a person", "label": 1}, {"captions": ["the wind blows while a vehicle engine runs", "a child speaks in closed space"], "sample_ids": ["xyL9F5VrjkE", "yW6FWLSLkx4"], "start_seconds": ["20", "40"], "properties": ["wind, blows, vehicle", "child, space, speak"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of the doll sitting on the bed with her hand outstretched towards the viewer"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking with background noise and breathing sounds "], "question": "which entity is speaking", "label": 1}, {"captions": ["some tunes played by whistling", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["u6BnG6YZqJ4", "su6FAOcOA8c"], "start_seconds": ["0", "4"], "properties": ["tune, play, whistling", "engine, idle, woman"], "captions_pred_video": ["a young boy standing in front of a group of kids in a classroom", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a person whistling a song", "a woman is speaking and a subway train is moving "], "question": "which is not a musical instrument", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a machine beeps continuously"], "sample_ids": ["sShpyu2l4YQ", "y682ml90jGw"], "start_seconds": ["0", "11"], "properties": ["growl, bark, yip", "beeps, machine, continuously"], "captions_pred_video": ["the puppies are playing with a toy", null], "captions_pred_audio": ["a dog is barking and growling", "a beeping sound is being made "], "question": 
"which entity is quieter", "label": 1}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "water flows as men speak and yell"], "sample_ids": ["siJFXfGWgDk", "vJ7JPEFhyLA"], "start_seconds": ["50", "16"], "properties": ["a, bird, vehicle", "water, flow, men"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more water flowing", "label": 1}, {"captions": ["goats bleat and people speak", "a person is whistling"], "sample_ids": ["z5iUE5h0EPs", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["goats bleat, people speak, language", "person, whistling, person"], "captions_pred_video": ["of the goat in the barn", null], "captions_pred_audio": ["a goat bleats and a man speaks", "a person whistling a song"], "question": "which entity is a person", "label": 1}, {"captions": ["the rumbling of a bus followed by a soft male voice", "motor noise is followed by a horn honking and a siren wailing"], "sample_ids": ["vK93VuO0yNc", "y2bVZ7rz-5M"], "start_seconds": ["30", "280"], "properties": ["male voice, bus, rumble", "motor noise, horn, siren"], "captions_pred_video": ["footage is blurry due to the movement of the bus as it drives through the city at night", "footage of a parade of fire trucks driving down the street"], "captions_pred_audio": ["a car drives by with wind noise in the background ", "a truck is honking its horn and a siren is blaring "], "question": "which entity is followed by a horn honking and a siren wailing?", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["u--KhUW8l1Y", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["horn, siren, life", "a 
woman, something, fried"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["vehicle engines race around a track as a man commentates", "a woman speaks and laughs and an animal grunts and snorts"], "sample_ids": ["sZPuqDgX2V0", "uEU-Hg5MTN8"], "start_seconds": ["30", "27"], "properties": ["commentator, race, track", "animal, grunts, snorts"], "captions_pred_video": [null, "of a girl wearing a pig mask and a boy hugging her"], "captions_pred_audio": ["a man is speaking and a helicopter is flying overhead ", "a woman is speaking and a baby is crying"], "question": "which entity is a video of a race?", "label": 0}, {"captions": ["sirens ring and approach with humming of distant traffic", "winds blows roughly as a vehicle races past"], "sample_ids": ["xERFUeZONz8", "xjvTpk2Zpr8"], "start_seconds": ["0", "70"], "properties": ["ring, approach, traffic", "wind, blows, vehicle"], "captions_pred_video": ["footage is blurry due to camera shake or motion blur", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["an emergency vehicle siren blares", "a jet engine roars and wind blows "], "question": "which entity is more likely to be heard", "label": 0}, {"captions": ["wind blowing followed by a zoom", "a motorcycle engine works nearby"], "sample_ids": ["vr8ZXjEBhMQ", "tOSWIURC-4"], "start_seconds": ["150", "0"], "properties": ["wind, blow, zoom", "engine, work, nearby"], "captions_pred_video": ["is taken from a motorcycle's point of view", null], "captions_pred_audio": ["wind blows and a chainsaw cuts through wood ", "a lawn mower is running "], "question": "which entity is a zoom of", "label": 0}, {"captions": ["television program is played far away while a woman talks and birds tweet 
nearby", "someone snores nearby"], "sample_ids": ["vbZ-0lGPneg", "spJCm8tD9Zo"], "start_seconds": ["30", "90"], "properties": ["a woman, a television program, a bird", "someone snores, nearby, someone"], "captions_pred_video": ["of a man holding a baby duck in his hands", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "a person is snoring loudly"], "question": "which entity is playing a television program", "label": 0}, {"captions": ["speaking following by laughing and clapping", "water flows and trickles"], "sample_ids": ["u2f5NpsoHBg", "tB7hWb9gTuQ"], "start_seconds": ["30", "30"], "properties": ["person, laugh, clap", "water, flow, trickle"], "captions_pred_video": ["is being projected on a screen at the front of the stage", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["a woman is speaking and a crowd is clapping", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "a man speaks as a motor runs in the background"], "sample_ids": ["x5cuQjOdM3E", "xZepNM9qcRA"], "start_seconds": ["30", "30"], "properties": ["cat, meows, young woman", "background, motor, run"], "captions_pred_video": ["a black background with an airplane flying in the sky", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is quieter", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "an engine runs loudly"], "sample_ids": ["vf44CgrjT0A", "vqZuVbG6-HI"], "start_seconds": ["20", "130"], "properties": ["loud, long, person", "loud, engine, run"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "footage is blurry because it's raining 
outside"], "captions_pred_audio": ["a loud burp", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["sucking and grunting followed by slurping with birds in the background", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["yYEVLuqEytU", "w5W5Kqtc8E"], "start_seconds": ["40", "100"], "properties": ["grunt, slurp, background", "wind, blow, vehicle"], "captions_pred_video": ["a baby goat is being petted by a person's hand", null], "captions_pred_audio": ["several sheep bleat and a man speaks", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks while video game music plays with some clicking", "a woman speaks and dog vocalizes"], "sample_ids": ["tw76HGONaKg", "uWAAAL4CIoc"], "start_seconds": ["570", "0"], "properties": ["music, click, man", "a, dog, vocalize"], "captions_pred_video": ["game you are currently playing on microsoft xbox 360 the legend of zelda ocarina of time the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda ocarina of time secrets, the legend of zelda ocarina of time wiki, the legend of zelda ocarina of time walkthrough, the legend of zelda ocarina of time guide, the legend of zelda ocarina of time tips, the legend of zelda ocarina of time cheats, the legend of zelda", null], "captions_pred_audio": ["a man speaks and types on a computer keyboard ", "a woman is speaking and a dog is barking "], "question": "which entity is a dog?", "label": 1}, {"captions": 
["multiple insects buzz over rustling wind", "a person is burping then speaks and laughs"], "sample_ids": ["tMJne1a4AFI", "wAAkbZToh8"], "start_seconds": ["0", "0"], "properties": ["wind, buzz, rustling", "burp, laugh, speak"], "captions_pred_video": ["a swarm of bees on the ground", null], "captions_pred_audio": ["a swarm of bees buzzing around", "a man burps and a woman speaks"], "question": "which entity is not a person?", "label": 0}, {"captions": ["a man sprays as a scraping occurs in the background", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sOa7g-44Dag", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["background, man, spray", "airplane, boy, fly"], "captions_pred_video": ["footage and stock-footage/b-roll of a man in a hazmat suit spraying insulation in an attic", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and rubbing his hands together ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is flying", "label": 1}, {"captions": ["a fly buzzes around loudly as birds chirp", "people speak in a closed space"], "sample_ids": ["uJV8NDaHqqk", "sTpirNYo8vQ"], "start_seconds": ["100", "30"], "properties": ["loud, fly, chirp", "people, space, speak"], "captions_pred_video": ["a bee hive in a wooden box", "of a man taking a selfie on a bus"], "captions_pred_audio": ["a swarm of bees buzzing around", "a man is speaking while a car is revving and accelerating "], "question": "which entity is quieter", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "food is frying then a woman speaks"], "sample_ids": ["siJFXfGWgDk", "ukxt9I7eMMg"], "start_seconds": ["50", "30"], "properties": ["man, woman, vehicle", "food, woman, speak"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of a person preparing food on a 
stool in a kitchen"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman is speaking while food is frying in the background "], "question": "which entity has a woman speaking?", "label": 0}, {"captions": ["a toilet flushes and water drains", "paper is crumpling consistently"], "sample_ids": ["sfAvvZwdLCY", "v5cSxLaHADY"], "start_seconds": ["20", "0"], "properties": ["water drains, flushes, water", "paper is crumpling, paper is white, paper is crumpling"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of the person holding a pair of scissors"], "captions_pred_audio": ["a toilet is flushed", "paper is crumpled and crinkled"], "question": "which entity is crumpling", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["vSeGhaZt-aI", "yDoT73BWsdA"], "start_seconds": ["50", "10"], "properties": ["water, bubbles, run", "engine, revs, vehicle"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle", "label": 1}, {"captions": ["several insects fly while two men talk", "people applaud and hoot and chat quietly"], "sample_ids": ["s-T9OVOiMLo", "wwyfGO2J4"], "start_seconds": ["330", "90"], "properties": ["several, fly, men", "people, applaud, hoot"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", null], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "people are clapping and speaking with background noise "], "question": "which entity is more active", "label": 1}, {"captions": ["water splashing and a person laughs in the distance then a man speaks nearby", "a drill 
drills through something then people begin laughing"], "sample_ids": ["vddP56-ogds", "tEE3MpBt1sg"], "start_seconds": ["30", "50"], "properties": ["water, splash, person, laugh", "drill, something, laugh"], "captions_pred_video": [null, "footage is blurry due to the smoke in the air"], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "people are laughing breathing and speaking with background noise "], "question": "which entity is about a drill?", "label": 1}, {"captions": ["people speak in the background as a clock ticktocks", "water splashes and a door squeaks"], "sample_ids": ["vZAw4apG0Es", "sdXV-ylviw"], "start_seconds": ["30", "190"], "properties": ["background, clock, ticktocks", "sound, splash, door"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", null], "captions_pred_audio": ["a clock is ticking and people are talking", "a dog barks and taps with background noise "], "question": "which entity has a clock in the background?", "label": 0}, {"captions": ["a young female speaks, followed by spraying and a female screaming", "a man speaks as a motor runs in the background"], "sample_ids": ["uYT5gxnyMWM", "xZepNM9qcRA"], "start_seconds": ["50", "30"], "properties": ["female, spraying, scream", "background, motor, run"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a person burps loudly for a long time nearby", "a heavy rain falls endlessly"], "sample_ids": ["vf44CgrjT0A", "wP8ZKrlx3oA"], "start_seconds": ["20", "40"], "properties": ["loud, long, person", "heavy, rain, fall"], "captions_pred_video": ["a man in a striped shirt with his mouth open in front of a bookshelf", "footage 
of a flooded street in the middle of a desert with mountains in the background"], "captions_pred_audio": ["a loud burp", "a heavy rain is falling on a surface"], "question": "which entity is falling", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a duck quacks continuously"], "sample_ids": ["yajyRTUQk3U", "vh30P49Po6s"], "start_seconds": ["400", "30"], "properties": ["noise, woman, speak", "quacks, continuously, duck"], "captions_pred_video": ["- a woman cooking in the kitchen", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["a man speaks as bees buzz and birds chirp", "a baby cries and fusses, a woman speaks, and a man speaks"], "sample_ids": ["t25U-v4k4ts", "wyllXV6PjKo"], "start_seconds": ["40", "30"], "properties": ["bees buzz, birds chirp, man speaks", "a baby, a woman, a man"], "captions_pred_video": ["of a beekeeper working on a beehive in the woods", null], "captions_pred_audio": ["a man is speaking and bees are buzzing", "a woman speaks and a baby cries"], "question": "which entity has a baby?", "label": 1}, {"captions": ["a man speaks followed by another man speaking outside", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["viuTg1M-dqg", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["two men, speak, follow", "a, scream, girl"], "captions_pred_video": ["footage of water coming out of a hole in the ground", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a woman is speaking and a baby is crying"], "question": "which entity has a scream followed by more girls talking?", "label": 1}, {"captions": ["a baby cries and a woman moans", "a man speaks as a motor runs in the background"], "sample_ids": 
["smDKStoHBJo", "xZepNM9qcRA"], "start_seconds": ["0", "30"], "properties": ["a, cry, woman", "background, motor, run"], "captions_pred_video": ["a man holding a crying baby in his arms", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a man speaks as a vehicles passes by then a woman speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["siJFXfGWgDk", "zj2R0XoFr5k"], "start_seconds": ["50", "50"], "properties": ["man, woman, vehicle", "airplane, boy, fly"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a woman speaks while a helicopter flies overhead "], "question": "which entity has a boy speaking?", "label": 1}, {"captions": ["a man speaks, another man speaks, and a small bell dings", "water splashes as an animal walks through"], "sample_ids": ["t69a8aRKhmc", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["a, b, c", "animal, water, splashes"], "captions_pred_video": ["footage is blurry and out of focus", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a train horn sounds as a railroad passing bell rings", "a woman speaks as she rubs two objects together"], "sample_ids": ["zgUgkpk78xU", "vzxHnu-SFEw"], "start_seconds": ["70", "80"], "properties": ["horn, bell, train", "two objects, woman, speak"], "captions_pred_video": ["of a train 
passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["distant men speak as a spray can nozzle is depressed", "multiple adults speaking, and a child shouting in the background"], "sample_ids": ["rwtmaKiCcQU", "yks4cLgIDMc"], "start_seconds": ["30", "170"], "properties": ["nozzle, depressed, spray can", "background, speaking, child"], "captions_pred_video": ["shows a man spraying paint on a wall with a spray gun", "footage of two kids wrestling on the floor"], "captions_pred_audio": ["spraying and people speaking", "a man is speaking and a child is crying"], "question": "which entity has a child shouting in the background?", "label": 
1}, {"captions": ["metal clacking as food and oil sizzles followed by a woman talking", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["vW4x7S1VfQc", "xfaoyyzw2WU"], "start_seconds": ["150", "180"], "properties": ["clacking, oil, woman", "loud, jet engine, roar"], "captions_pred_video": ["footage of a person cooking fish in a frying pan on a stove top", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["food sizzles in a frying pan", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a clock ticks quietly and rhythmically", "an infant crying frantically"], "sample_ids": ["u7C-AEBQM", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["ticks, rhythmic, quiet", "cry, infant, frantically"], "captions_pred_video": [null, "of the baby crying in the car seat"], "captions_pred_audio": ["a ticktock of a clock", "a baby cries loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["a horn honks twice and keys jingle, followed by an electronic beep", "a man speaks as a motor runs in the background"], "sample_ids": ["wSVhSdj0F0", "xZepNM9qcRA"], "start_seconds": ["10", "30"], "properties": ["horn honks, keys jingle, electronic beep", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a car horn honks and keys jangle with background noise ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a motorcycle idles loudly as wind blows", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["v7jJS8aAyA", "tdWhHV3X25Q"], "start_seconds": ["10", "60"], "properties": ["wind, blows, loudly", "applause, audience, yells"], "captions_pred_video": [null, "a man is talking to another man on a stage in front of a microphone"], 
"captions_pred_audio": ["a motorcycle engine is idling and vibrating", "a man is speaking and a crowd is clapping"], "question": "which entity is a performance", "label": 1}, {"captions": ["a jet engine spools up and takes off", "water is sprayed across a hard surface"], "sample_ids": ["vBslzh7saPw", "sQwlkXjQabo"], "start_seconds": ["90", "10"], "properties": ["engine, spools, takes", "water, spray, surface"], "captions_pred_video": ["a pickup truck carrying a large object down the road", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a jet engine roars and accelerates ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a bell rings multiple times before a siren sounds in the distance", "people cheer as a vehicle engine revs"], "sample_ids": ["ul60S8TXDA8", "xjhAnI2q6hM"], "start_seconds": ["60", "6"], "properties": ["sound, distance, bell", "engine revs, vehicle, people"], "captions_pred_video": ["game the legend of zelda ocarina of time on nintendo 64", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a bell rings and mechanisms make ticking sounds ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["frogs croak and vocalize", "birds chirp and objects are moved around"], "sample_ids": ["yswmmRZFItk", "yPUYU6t3rwo"], "start_seconds": ["0", "370"], "properties": ["croak, vocalize, frog", "birds chirp, objects are moved around, birds"], "captions_pred_video": ["a close up of a frog in the water", "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a frog is croaking", "insects buzz and a man speaks"], "question": "which entity is a bird", "label": 1}, {"captions": ["an animal bleats and cries out and metal bangs", "a train whistle keeps going off while the clickety-clack of the train on the rails are 
continuous"], "sample_ids": ["xfudFO976zE", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["animal, bleats, cry", "clickety-clack, train, whistle"], "captions_pred_video": ["footage is blurry and shaky", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a goat bleats and birds chirp in the background ", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["vehicle tires screech and a man speaks before a car door opens", "a man speaks as a car is passing by"], "sample_ids": ["sxYkFKFIZD0", "sK4u5T8hW78"], "start_seconds": ["20", "30"], "properties": ["screech, man, door", "a, car, pass"], "captions_pred_video": ["2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2005 audi a4 1 8t 2", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating with a squeal in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is about a car passing by?", "label": 1}, {"captions": ["a car revs and accelerates loudly and men and women chatter among themselves", "a man speaks as a motor runs in the background"], "sample_ids": ["y8dSeubCNI", "xZepNM9qcRA"], "start_seconds": ["4", "30"], "properties": ["men, 
women, car", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["an engine revving and people talking in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["a person whistles a meandering tune", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["uFoga8sHpiw", "uYT5gxnyMWM"], "start_seconds": ["90", "50"], "properties": ["person, tune, whistle", "female, spraying, scream"], "captions_pred_video": ["footage of a bird in a cage", "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a person whistles a song", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 0}, {"captions": ["a man laughs and speaks as cats purr and hiss", "people speak as gunfire rings out"], "sample_ids": ["vVhthZ45k3Y", "wqTCwqVRDlk"], "start_seconds": ["30", "80"], "properties": ["cat, purr, hiss", "gunfire, ring, speak"], "captions_pred_video": ["footage is blurry and out of focus", "of a woman shooting a gun at a target on the beach"], "captions_pred_audio": ["a man is speaking and a cat is meowing", "a man is speaking and a gun is fired"], "question": "which entity is more quiet", "label": 0}, {"captions": ["rain falls on a surface as men speak and music plays", "a girl speaks followed by a scream and more girls talking"], "sample_ids": ["w0xsN8X18Y", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["music, surface, rain", "a, scream, girl"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a woman is speaking and a baby is crying"], "question": "which entity has more people speaking", "label": 1}, {"captions": ["birds chirp, a woman speaks, 
and insects buzz", "two men and a woman talk while wind blows and birds tweet"], "sample_ids": ["t97k0cejSQE", "wqZ135Ssz0"], "start_seconds": ["250", "60"], "properties": ["sound, chirp, buzz", "two men, woman, birds"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking and ducks are quacking with wind noise in the background "], "question": "which entity has more birds", "label": 1}, {"captions": ["a toilet flushes and water drains", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["sfAvvZwdLCY", "wDVMhEdTiVw"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "gun, shoot, water"], "captions_pred_video": ["footage of the toilet in the bathroom", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a toilet is flushed", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is a source of water", "label": 1}, {"captions": ["music plays, a person speaks, followed by whooshes and a ding", "small dogs growl, bark and yip."], "sample_ids": ["tQWGZLItBXk", "sShpyu2l4YQ"], "start_seconds": ["170", "0"], "properties": ["music, person, ding", "growl, bark, yip"], "captions_pred_video": ["worms revolution screenshots", "the puppies are playing with a toy"], "captions_pred_audio": ["a child speaks music plays video game sounds sound effects and sound effects play ", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["children speak and play together", "a man talks while a clock does ticktock"], "sample_ids": ["yVVP8XvWJTo", "spYNpeN7rPY"], "start_seconds": ["260", "1"], "properties": ["children, speak, play", "a clock, ticktock, man"], "captions_pred_video": ["footage of a playground at a school or daycare center", "in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a man is speaking and breathing with background noise "], "question": "which entity is a clock?", "label": 1}, {"captions": ["an infant crying frantically", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["zwOBqeFTgiU", "vbZ-0lGPneg"], "start_seconds": ["30", "30"], "properties": ["cry, infant, frantically", "a woman, a television program, a bird"], "captions_pred_video": ["of the baby crying in the car seat", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a baby cries loudly", "a woman is speaking and a dog is whimpering"], "question": "which entity is a person", "label": 1}, {"captions": ["race cars go around a track as a man commentates", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["uZesmtKZGSw", "wDVMhEdTiVw"], "start_seconds": ["250", "30"], "properties": ["car, track, man", "gun, shoot, water"], "captions_pred_video": ["1970 chevrolet holden monaro 
gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a man is speaking and a car is revving with laughter in the background ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["waves crash against a shoreline and wind blows", "an airplane engine spools and people speak"], "sample_ids": ["zdYdyF9-m8U", "wTjoRj1se3U"], "start_seconds": ["7", "390"], "properties": ["wind, crash, shoreline", "airplane, engine, spool"], "captions_pred_video": ["a person kayaking in the ocean near a cliff", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["waves crash and wind blows ", "a jet engine is running and people are talking"], "question": "which entity is stationary", "label": 1}, {"captions": ["birds tweet and squawk", "wind blows as people chatter quietly"], "sample_ids": ["w1mlz3Pe4fU", "xBxDz0CFVn0"], "start_seconds": ["300", "30"], "properties": ["squawk, tweet, scream", "wind, chatter, people"], "captions_pred_video": ["of a bird in a cage", "footage is blurry and out of focus"], "captions_pred_audio": ["birds are chirping and singing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a group of people chatter and talk as multiple horns honk in the background", "someone is typing on a computer keyboard"], "sample_ids": 
["yLy-WycbVVE", "v0x1odnXtP0"], "start_seconds": ["30", "210"], "properties": ["background, people, talk", "keyboard, type, computer"], "captions_pred_video": ["a soccer field in a stadium with yellow and red seats", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking and a church bell is ringing with wind noise in the background ", "a person is typing on a keyboard"], "question": "which entity is typing on a keyboard", "label": 1}, {"captions": ["a man speaks while turning a water faucet on", "a propeller rotates loudly and intensely"], "sample_ids": ["vf9xf3vMsGM", "ugHJF0hfYkg"], "start_seconds": ["540", "10"], "properties": ["A man speaks while turning a water faucet on.", "loud, intense, propeller"], "captions_pred_video": ["of the person washing their hands under the faucet", "a man in a helicopter cockpit wearing headphones"], "captions_pred_audio": ["a man is speaking while water is running in the background", "a helicopter is flying overhead "], "question": "which entity is quieter", "label": 0}, {"captions": ["a toilet flushes and water drains", "a vehicle engine accelerating then running on idle"], "sample_ids": ["sfAvvZwdLCY", "vYkA3cfXp5Q"], "start_seconds": ["20", "30"], "properties": ["water drains, flushes, water", "engine, accelerate, idle"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a toilet is flushed", "an engine is idling"], "question": "which entity is a machine", "label": 1}, {"captions": ["water splashes as an animal walks through", "wind blows as people chatter quietly"], "sample_ids": ["w1ir-sZ3Im8", "xBxDz0CFVn0"], "start_seconds": ["90", "30"], "properties": ["animal, water, splashes", "wind, chatter, people"], "captions_pred_video": ["footage of a group of people riding horses through a river", "footage is blurry and out of focus"], "captions_pred_audio": ["water splashes and gurgles as people 
speak", "a man is speaking with wind noise in the background "], "question": "which entity is more quiet", "label": 1}, {"captions": ["a machine beeps continuously", "an engine runs loudly"], "sample_ids": ["y682ml90jGw", "vqZuVbG6-HI"], "start_seconds": ["11", "130"], "properties": ["beeps, machine, continuously", "loud, engine, run"], "captions_pred_video": [null, "footage is blurry because it's raining outside"], "captions_pred_audio": ["a beeping sound is being made ", "a lawn mower is running and men are speaking "], "question": "which entity is louder", "label": 1}, {"captions": ["a man is filing a hard object", "an airplane engine runs"], "sample_ids": ["vveS8HT7Uog", "yVPZ2MNWpms"], "start_seconds": ["100", "0"], "properties": ["a man, hard, object", "engine, airplane, runs"], "captions_pred_video": ["footage is of a workbench with various tools on it including a hammer and a screwdriver", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a man is filing and speaking with background noise and breathing ", "a car is driving by on the road "], "question": "which object is moving", "label": 1}, {"captions": ["a woman speaks and taps on a hard surface before running tap water", "some tunes played by whistling"], "sample_ids": ["wvKpEYswXO0", "u6BnG6YZqJ4"], "start_seconds": ["150", "0"], "properties": ["water, tap, run", "tune, play, whistling"], "captions_pred_video": ["of the person preparing food in the kitchen", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a woman is speaking and tapping with background noise and water running ", "a person whistling a song"], "question": "which entity is not a musical instrument?", "label": 0}, {"captions": ["a woman talks while a baby cries and a man whispers", "a stream of water runs briefly"], "sample_ids": ["smDKStoHBJo", "x-PeY8Yb8M4"], "start_seconds": ["0", "300"], "properties": ["a, talk, baby, cry", "stream, water, run"], 
"captions_pred_video": ["a man holding a crying baby in his arms", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a baby is crying and a woman is speaking", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a heavy rain falls endlessly", "continuous sneezing together with speech"], "sample_ids": ["wP8ZKrlx3oA", "x4dZyf9Gbj0"], "start_seconds": ["40", "130"], "properties": ["heavy, rain, fall", "continuous, sneeze, speech"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "footage is blurry and out of focus"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a woman sneezes and speaks"], "question": "which entity is more likely to be heard", "label": 1}, {"captions": ["a duck quacks several times", "a clock alarm sounds and gears turn"], "sample_ids": ["vh30P49Po6s", "w2M4i1mklOA"], "start_seconds": ["30", "30"], "properties": ["quacks, duck, several", "alarm, gears, turn"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "footage of an antique clock"], "captions_pred_audio": ["a duck is quacking loudly", "a clock is ticking and a bell is ringing "], "question": "which entity is a clock?", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "vehicles pass by on a roadway"], "sample_ids": ["vb1fPSDI4c", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["multiple, people, yell", "pass, vehicle, roadway"], "captions_pred_video": [null, "footage of a fire truck entering a garage"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "three men talk while wind blows and some liquid flows"], "sample_ids": ["zALy31PjDl0", 
"vJ7JPEFhyLA"], "start_seconds": ["21", "16"], "properties": ["a man, a vehicle, a horn", "three men, wind, flow"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity has more people", "label": 1}, {"captions": ["a railroad crossing bell rings as a train horn blows", "some men converse over an engine running"], "sample_ids": ["tZGN5a7ybxo", "sCiy7QS1U"], "start_seconds": ["60", "300"], "properties": ["ring, train, horn", "men, converse, engine"], "captions_pred_video": ["is taken from a moving vehicle on the train tracks", null], "captions_pred_audio": ["a train is moving and blowing its horn ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a train?", "label": 0}, {"captions": ["two women and a man talk while a kid cries", "water splashes as an animal walks through"], "sample_ids": ["wyllXV6PjKo", "w1ir-sZ3Im8"], "start_seconds": ["30", "90"], "properties": ["a kid, talk, cry", "animal, water, splashes"], "captions_pred_video": [null, "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman speaks and a baby cries", "water splashes and gurgles as people speak"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks as horns blow", "water splashes as an animal walks through"], "sample_ids": ["tHyNqRyK34A", "w1ir-sZ3Im8"], "start_seconds": ["24", "90"], "properties": ["a, man, speaks", "animal, water, splashes"], "captions_pred_video": ["being taken from inside a vehicle on the street at night", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a man is speaking and a car is honking with background noise ", "water splashes and gurgles as people speak"], "question": 
"which entity is more active", "label": 1}, {"captions": ["a man speaking with light rustling", "some men converse over an engine running"], "sample_ids": ["zOZleIRqZm4", "sCiy7QS1U"], "start_seconds": ["80", "300"], "properties": ["light, rustling, man", "men, converse, engine"], "captions_pred_video": ["a person picking berries from the bushes in the garden", null], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking with light rustling?", "label": 0}, {"captions": ["a man talks followed by a woman shouting", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["s3cTDAj31g", "su6FAOcOA8c"], "start_seconds": ["80", "4"], "properties": ["man, talk, woman", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a man is speaking and a baby is crying", "a woman is speaking and a subway train is moving "], "question": "which entity has a woman speaking?", "label": 1}, {"captions": ["a horn blasts as warning bells ring", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["zgUgkpk78xU", "zj2R0XoFr5k"], "start_seconds": ["70", "50"], "properties": ["horn, bells, ring", "airplane, boy, fly"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a woman speaks while a helicopter flies overhead "], "question": "which entity 
is flying", "label": 1}, {"captions": ["multiple people speak and children yell while water gurgles", "wind blows as people chatter quietly"], "sample_ids": ["vb1fPSDI4c", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["multiple, people, yell", "wind, chatter, people"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a crowd of people are talking and laughing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["sa6TLVbooCc", "tdWhHV3X25Q"], "start_seconds": ["240", "60"], "properties": ["people, laugh, child", "applause, audience, yells"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a grown man speaks and water bubbles and runs", "several insects fly while two men talk"], "sample_ids": ["vSeGhaZt-aI", "s-T9OVOiMLo"], "start_seconds": ["50", "330"], "properties": ["water, bubbles, run", "several, fly, men"], "captions_pred_video": ["a man in a kitchen preparing a smoothie with a blender", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking and pouring liquid with background noise ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a machine beeps continuously"], "sample_ids": ["xyx6eNVEYRY", "y682ml90jGw"], "start_seconds": ["380", "11"], "properties": ["loud, engine, muffles", "beeps, machine, continuously"], 
"captions_pred_video": ["footage of a helicopter landing on a runway at an airport", null], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a beeping sound is being made "], "question": "which entity is quieter", "label": 1}, {"captions": ["a rumble grows louder", "wind blows as people chatter quietly"], "sample_ids": ["y4MY9mp8-TA", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["loudness, increase, rumble", "wind, chatter, people"], "captions_pred_video": ["a helicopter flying in the sky", "footage is blurry and out of focus"], "captions_pred_audio": ["a helicopter flies overhead ", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a vehicle is skidding and squealing tires", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["soTOh3zYJfY", "zFjIWfSD-4"], "start_seconds": ["40", "410"], "properties": ["vehicle, skid, tires", "People, motor, brakes"], "captions_pred_video": ["a red car drifting on a winding road with smoke coming out of it", null], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["children cheer as a man speaks then an audience screams", "an infant crying as a woman laughs"], "sample_ids": ["vJvryTwuAV8", "xhmRY9yhC7c"], "start_seconds": ["16", "20"], "properties": ["audience, cheer, man", "a, laugh, infant"], "captions_pred_video": ["a crowd of people are gathered in a mall, watching a man take a selfie in front of the crowd", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking and a crowd is shouting and whooping ", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["rain falls on a surface as men speak and music plays", "some men converse over an engine running"], 
"sample_ids": ["w0xsN8X18Y", "sCiy7QS1U"], "start_seconds": ["30", "300"], "properties": ["music, surface, rain", "men, converse, engine"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking while a motorboat is moving in the background ", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is about a man speaking and music playing?", "label": 0}, {"captions": ["someone whistles a song", "pigeons vocalize and birds chirp"], "sample_ids": ["sIXTftIuUgw", "uiS58TNyUiw"], "start_seconds": ["90", "430"], "properties": ["someone, song, whistle", "vocalize, bird, chirp"], "captions_pred_video": [null, "of the pigeon in the cage"], "captions_pred_audio": ["a person whistling a song", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a loud engine muffles a man as he speaks", "a vehicle engine revs as the vehicle passes"], "sample_ids": ["xyx6eNVEYRY", "yDoT73BWsdA"], "start_seconds": ["380", "10"], "properties": ["loud, engine, muffles", "engine, revs, vehicle"], "captions_pred_video": ["footage of a helicopter landing on a runway at an airport", "a man driving a race car with a helmet on the steering wheel"], "captions_pred_audio": ["an aircraft engine is running and a man is speaking ", "a race car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a person sneezes followed by another person speaking", "someone snores nearby"], "sample_ids": ["t8CV69hcvF0", "spJCm8tD9Zo"], "start_seconds": ["210", "90"], "properties": ["person, sneeze, follow", "someone snores, nearby, someone"], "captions_pred_video": ["of an airplane flying in the dark sky at night", "of a man laying on the ground with his mouth open"], "captions_pred_audio": ["a woman sneezes and speaks", "a person is snoring loudly"], "question": "which entity is a person", "label": 0}, {"captions": ["children speak and play together", "an 
insect buzzes around continuously"], "sample_ids": ["yVVP8XvWJTo", "v25l1jef3JY"], "start_seconds": ["260", "0"], "properties": ["children, speak, play", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a playground at a school or daycare center", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["children are speaking and breathing with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is not a person", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "pigeons vocalize and birds chirp"], "sample_ids": ["sofxkNWaP0s", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["wind, engine, louder", "vocalize, bird, chirp"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign in the foreground", "of the pigeon in the cage"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a toilet flushes and water drains", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["sfAvvZwdLCY", "vKrYfzleLB8"], "start_seconds": ["20", "110"], "properties": ["water drains, flushes, water", "a, ring, gunshots"], "captions_pred_video": ["footage of the toilet in the bathroom", "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["a toilet is flushed", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity is more likely to be in a movie", "label": 1}, {"captions": ["a young woman speaks and laughs and an animal snorts", "a man speaks as birds chirp and a vehicle passes nearby"], "sample_ids": ["uEU-Hg5MTN8", "siJFXfGWgDk"], "start_seconds": ["27", "50"], "properties": ["a woman, laughs, animal", "a, bird, vehicle"], "captions_pred_video": ["of a girl wearing a pig mask 
and a boy hugging her", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "a man is speaking and birds are chirping in the background "], "question": "which entity has a vehicle passing nearby?", "label": 1}, {"captions": ["cutting machine running then powering down followed by a series of metal clanking in the background", "a stream of water runs briefly"], "sample_ids": ["zofjfKhqLk8", "x-PeY8Yb8M4"], "start_seconds": ["10", "300"], "properties": ["background, metal, clank", "stream, water, run"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a car is driving on a wet road "], "question": "which entity is a stream of water?", "label": 1}, {"captions": ["a diesel truck engine runs continuously", "a duck quacks continuously"], "sample_ids": ["sZvwOuuPGP0", "vh30P49Po6s"], "start_seconds": ["50", "30"], "properties": ["engine, diesel, truck", "quacks, continuously, duck"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a medium engine is running ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an engine idles consistently before sputtering some", "water pouring and bubbling"], "sample_ids": ["rwTERCUno", "uyRfq-jKPpo"], "start_seconds": ["90", "50"], "properties": ["engine, idle, sputter", "water, bubbles, pouring"], "captions_pred_video": [null, "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle 
youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["an engine is idling and vibrating", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a man speaks as a car is passing by", "a bird chirps in response to a woman chirping for the birds"], "sample_ids": ["sK4u5T8hW78", "uOpoD0gGXcs"], "start_seconds": ["30", "120"], "properties": ["a, car, pass", "chirps, woman, bird"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a herd of cows grazing in the field"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "birds are chirping and a man is speaking"], "question": "which entity is a response to a chirps", "label": 1}, {"captions": ["an engine revs and a turning noise is made", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["tOSWIURC-4", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["noise, engine, revs", "airplane, boy, fly"], "captions_pred_video": [null, "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a lawn mower is running ", "a woman speaks 
while a helicopter flies overhead "], "question": "which entity is a moving object", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "vehicles pass by on a roadway"], "sample_ids": ["sfAvvZwdLCY", "tgbONvsP47Y"], "start_seconds": ["20", "0"], "properties": ["flushes, drains, water", "pass, vehicle, roadway"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a toilet is flushed", "a car is driving on the road "], "question": "which entity is moving", "label": 1}, {"captions": ["music plays and repeated slaps accompany human sniveling, then insect buzz", "a man speaks followed by another man speaking outside"], "sample_ids": ["weDbePuc-Xc", "viuTg1M-dqg"], "start_seconds": ["40", "30"], "properties": ["music, slaps, human", "two men, speak, follow"], "captions_pred_video": ["a cartoon frog and a butterfly are sitting on the ground next to each other", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a man is speaking and birds are chirping with a frog croaking in the background ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has two men speaking?", "label": 1}, {"captions": ["animals bleat and cry out and then a woman speaks", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["yZp6xizR0yU", "zj2R0XoFr5k"], "start_seconds": ["30", "50"], "properties": ["animal, bleat, cry", "airplane, boy, fly"], "captions_pred_video": ["footage of a woman feeding goats in a barn", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a woman is speaking and a goat is bleating", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video of a plane flying by?", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is 
released", "a door slams shut and an object moves on a hard surface"], "sample_ids": ["vuUVPzd2FXw", "zkKdxzNC97Y"], "start_seconds": ["160", "27"], "properties": ["a, steam, release", "hard, surface, door"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "footage of the door opening and closing in slow motion"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a door is opened and closed"], "question": "which entity is a door?", "label": 1}, {"captions": ["people speak and laugh as a child speaks", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["sa6TLVbooCc", "xfaoyyzw2WU"], "start_seconds": ["240", "180"], "properties": ["people, laugh, child", "loud, jet engine, roar"], "captions_pred_video": ["a video of a little boy trying to get a toy from a shelf in a store", "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a woman is speaking and a child is coughing", "an aircraft engine roars and a man speaks "], "question": "which is louder", "label": 1}, {"captions": ["a duck quacks several times", "an audience gives applause"], "sample_ids": ["vh30P49Po6s", "x6iCUDmRpKQ"], "start_seconds": ["30", "38"], "properties": ["quacks, duck, several", "applause, audience, give"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a black background with the moon and stars in the sky"], "captions_pred_audio": ["a duck is quacking loudly", "a group of people are clapping and cheering"], "question": "which is not a verb", "label": 0}, {"captions": ["water runs into a sink while men speak", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["vzceMbklWc", "vbZ-0lGPneg"], "start_seconds": ["180", "30"], "properties": ["water, sink, run", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["water is running and 
a man is speaking", "a woman is speaking and a dog is whimpering"], "question": "which entity has more moving parts", "label": 1}, {"captions": ["a child speaks in closed space", "a stream of water runs briefly"], "sample_ids": ["yW6FWLSLkx4", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["child, space, speak", "stream, water, run"], "captions_pred_video": ["of the doll sitting on the bed with her hand outstretched towards the viewer", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a woman is speaking with background noise and breathing sounds ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["water pouring and bubbling", "pigeons vocalize and birds chirp"], "sample_ids": ["uyRfq-jKPpo", "uiS58TNyUiw"], "start_seconds": ["50", "430"], "properties": ["water, bubbles, pouring", "vocalize, bird, chirp"], "captions_pred_video": ["on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu", "of the pigeon in the cage"], "captions_pred_audio": ["water is running from a faucet", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["a person screams glaringly", "birds chirp and objects are moved around"], "sample_ids": ["xC8kbrKJmco", "yPUYU6t3rwo"], 
"start_seconds": ["0", "370"], "properties": ["glaringly, screams, person", "birds chirp, objects are moved around, birds"], "captions_pred_video": [null, "footage and stock-footage/b-roll of a beekeeper opening a beehive"], "captions_pred_audio": ["a goat is bleating ", "insects buzz and a man speaks"], "question": "which entity is more active", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "people cheer as a vehicle engine revs"], "sample_ids": ["sTpirNYo8vQ", "xjhAnI2q6hM"], "start_seconds": ["30", "6"], "properties": ["a, tone, fast", "engine revs, vehicle, people"], "captions_pred_video": ["of a man taking a selfie on a bus", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle?", "label": 1}, {"captions": ["a horn blasts loudly as a train passes", "water is sprayed across a hard surface"], "sample_ids": ["zsLxS-uLJTw", "sQwlkXjQabo"], "start_seconds": ["20", "10"], "properties": ["horn, blast, train", "water, spray, surface"], "captions_pred_video": ["footage of the train on the tracks at sunrise or sunset", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a train blows its horn and moves on the tracks ", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a duck quacks continuously", "a car speeding up in the distance"], "sample_ids": ["vh30P49Po6s", "u0TrcHhkPQ"], "start_seconds": ["30", "20"], "properties": ["quacks, continuously, duck", "distance, car, speed"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", null], "captions_pred_audio": ["a duck is quacking loudly", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a goat bleats and 
someone makes a calling noise", "an insect buzzes around continuously"], "sample_ids": ["vlS6YMeWAPo", "v25l1jef3JY"], "start_seconds": ["40", "0"], "properties": ["noise, bleat, call", "buzzes, continuously, insect"], "captions_pred_video": ["footage of a goat in a pen behind a wooden fence", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a goat bleats and birds chirp", "a fly is buzzing around a microphone "], "question": "which entity is not a noise", "label": 1}, {"captions": ["a cat meows and children speak", "an infant crying frantically"], "sample_ids": ["x5cuQjOdM3E", "zwOBqeFTgiU"], "start_seconds": ["30", "30"], "properties": ["cat, speak, children", "cry, infant, frantically"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the baby crying in the car seat"], "captions_pred_audio": ["a cat meows and a woman speaks", "a baby cries loudly"], "question": "which entity is crying", "label": 1}, {"captions": ["someone whistles briefly", "a man speaks as a motor runs in the background"], "sample_ids": ["uFoga8sHpiw", "xZepNM9qcRA"], "start_seconds": ["90", "30"], "properties": ["sound, duration, pitch", "background, motor, run"], "captions_pred_video": ["footage of a bird in a cage", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a person whistles a song", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity has a longer duration", "label": 1}, {"captions": ["several beeps are followed by a hit and a woman talking", "water pouring and bubbling"], "sample_ids": ["w34HjHr6gAY", "uyRfq-jKPpo"], "start_seconds": ["30", "50"], "properties": ["beeps, hit, woman", "water, bubbles, pouring"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "water is running from a faucet"], "question": "which entity is a video of a liquid flowing and bubbling?", "label": 1}, {"captions": ["an emergency vehicle engine runs then a horn blows and siren sounds", "some men talk among st themselves as cars speed and race loudly"], "sample_ids": ["y2bVZ7rz-5M", "uZesmtKZGSw"], "start_seconds": ["280", "250"], "properties": ["engine, horn, siren", "men, talk, cars"], "captions_pred_video": ["footage of a parade of fire trucks driving down the street", "1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 
1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet holden monaro gts 1970 chevrolet"], "captions_pred_audio": ["a truck is honking its horn and a siren is blaring ", "a man is speaking and a car is revving with laughter in the background "], "question": "which entity is a vehicle", "label": 0}, {"captions": ["water flows as men speak and yell", "someone is typing on a computer keyboard"], "sample_ids": ["vJ7JPEFhyLA", "v0x1odnXtP0"], "start_seconds": ["16", "210"], "properties": ["water, flow, men", "keyboard, type, computer"], "captions_pred_video": ["a man in a red shirt paddling a kayak in the water", "how to make money on youtube in spanish"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a person is typing on a keyboard"], "question": "which is a still image", "label": 1}, {"captions": ["continuous snoring", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["sLkeqCDJIyw", "yajyRTUQk3U"], "start_seconds": ["120", "400"], "properties": ["loud, snoring, noise", "a woman, something, fried"], "captions_pred_video": [", what is the man doing on the couch? 
sleeping", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a person is snoring loudly", "a woman is speaking while food is frying in the background"], "question": "which entity is a video", "label": 1}, {"captions": ["two men speak as a buffeting wind blows", "a stream of water runs briefly"], "sample_ids": ["y8WEcpOlT3I", "x-PeY8Yb8M4"], "start_seconds": ["40", "300"], "properties": ["wind, speak, buffeting", "stream, water, run"], "captions_pred_video": ["on how to use a sewing machine youtube", "a man sitting on a rock in the middle of a river"], "captions_pred_audio": ["a man is speaking with wind noise in the background ", "a car is driving on a wet road "], "question": "which entity is moving", "label": 1}, {"captions": ["a horn rings out as a machine runs by", "water pouring and bubbling"], "sample_ids": ["slZLHwNbbt4", "uyRfq-jKPpo"], "start_seconds": ["300", "50"], "properties": ["a, horn, run", "water, bubbles, pouring"], "captions_pred_video": ["footage of a train coming down the tracks on a sunny day", "on youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro puff hairstyle youtube how to do an afro pu"], "captions_pred_audio": ["a train is moving and blowing its horn with a clickety-clack sound ", "water is running from a faucet"], "question": "which entity is a liquid", "label": 1}, {"captions": ["an adult woman and an 
adult man speak", "a frog vocalizes as birds chirp"], "sample_ids": ["zTLVJCo4WEE", "wqUmIEzuNz4"], "start_seconds": ["30", "30"], "properties": ["two people, adult, speak", "frog, bird, vocalize"], "captions_pred_video": ["- a boy with a rifle aiming at a target", "a frog sitting in the grass on a sunny day"], "captions_pred_audio": ["a woman speaks and crickets chirp", "a cat meows and rustles"], "question": "which entity is a single entity", "label": 1}, {"captions": ["children cry and people talk", "a jet engine roars, almost making a man inaudible"], "sample_ids": ["xLwHe825Zs", "xfaoyyzw2WU"], "start_seconds": ["18", "180"], "properties": ["people talk, children cry, people talk", "loud, jet engine, roar"], "captions_pred_video": [null, "footage of an airplane on the tarmac at an airport"], "captions_pred_audio": ["a baby cries and a woman speaks", "an aircraft engine roars and a man speaks "], "question": "which entity is louder", "label": 1}, {"captions": ["a man speaks as water trickles down a stream", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sapQIQUhFc", "sSMl2vc3ek"], "start_seconds": ["280", "20"], "properties": ["water, stream, trickles", "loud, multiple, distance"], "captions_pred_video": [null, null], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["the clinking of a train bell with the humming of an engine and a train horn blowing", "a man speaks as a vehicles passes by then a woman speaks"], "sample_ids": ["zgUgkpk78xU", "siJFXfGWgDk"], "start_seconds": ["70", "50"], "properties": ["clinking, humming, horn", "man, woman, vehicle"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 
1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", "footage of a beekeeper working with bees in a beehive"], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking and birds are chirping in the background "], "question": "which entity is a video of a man speaking?", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "an airplane engine runs"], "sample_ids": ["vs65y4qmyBE", "yVPZ2MNWpms"], "start_seconds": ["340", "0"], "properties": ["engine, run, man", "engine, airplane, runs"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "footage of an airport with planes parked on the tarmac"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a car is driving by on the road "], "question": "which entity has a running engine", "label": 1}, {"captions": ["a machine engine runs and a man speaks", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["vs65y4qmyBE", "vfYTJq7nU"], "start_seconds": ["340", "130"], "properties": ["engine, run, man", "rustling, ducks, quack"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", null], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a duck quacks and a woman speaks"], "question": "which entity is about a machine?", "label": 0}, {"captions": ["a man talks nearby and another man talks far away while some liquid flows", "continuous sneezing together with speech"], "sample_ids": ["sapQIQUhFc", "x4dZyf9Gbj0"], "start_seconds": ["280", "130"], "properties": ["liquid, flow, distance", "continuous, sneeze, speech"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking and a stream is flowing in the background ", "a woman sneezes and speaks"], "question": "which entity is more like a 
sneeze", "label": 1}, {"captions": ["a child yells and another yells", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["vMDHu7Lxcgw", "w5W5Kqtc8E"], "start_seconds": ["410", "100"], "properties": ["two, yell, child", "wind, blow, vehicle"], "captions_pred_video": ["a boy playing on a trampoline in the backyard", null], "captions_pred_audio": ["a woman is speaking and a child is shouting", "a motorboat is moving and people are shouting and cheering "], "question": "which entity has more than one person yelling?", "label": 0}, {"captions": ["an engine idles consistently before sputtering some", "gunshots ring out, a man yells, and more shots follow"], "sample_ids": ["rwTERCUno", "vKrYfzleLB8"], "start_seconds": ["90", "110"], "properties": ["engine, idle, sputter", "a, ring, gunshots"], "captions_pred_video": [null, "stock footage of a person holding a gun in their hand"], "captions_pred_audio": ["an engine is idling and vibrating", "a man is speaking with background noise and a cap gun is fired "], "question": "which entity is more violent", "label": 1}, {"captions": ["a airplane flies overhead as a woman speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zj2R0XoFr5k", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["airplane, fly, woman", "rooster, crow, background, men"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster in the background?", "label": 1}, {"captions": ["metal rumbles followed by a kid giggles then more metal rumbling followed by a guy speaking", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sQGXqGcwOTc", 
"zj2R0XoFr5k"], "start_seconds": ["3", "50"], "properties": ["audio, kid, giggles", "airplane, boy, fly"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video?", "label": 1}, {"captions": ["people speak softly as food sizzles", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["yhQ2Lg-7qDY", "ukg5L09Wpvo"], "start_seconds": ["130", "150"], "properties": ["food, sizzle, speak", "clickety-clack, train, whistle"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a train blows its whistle and blows its horn "], "question": "which entity is a train?", "label": 1}, {"captions": ["heavy rain splashes as it falls", "multiple insects buzz over rustling wind"], "sample_ids": ["wP8ZKrlx3oA", "tMJne1a4AFI"], "start_seconds": ["40", "0"], "properties": ["fall, rain, splash", "wind, buzz, rustling"], "captions_pred_video": ["footage of a flooded street in the middle of a desert with mountains in the background", "a swarm of bees on the ground"], "captions_pred_audio": ["a heavy rain is falling on a surface", "a swarm of bees buzzing around"], "question": "which entity is not a splash", "label": 1}, {"captions": ["an engine runs loudly", "water flows and trickles"], "sample_ids": ["vqZuVbG6-HI", "tB7hWb9gTuQ"], "start_seconds": ["130", "30"], "properties": ["loud, engine, run", "water, flow, trickle"], "captions_pred_video": ["footage is blurry because it's raining outside", "the rocks on the beach are surrounded by water and the sky is visible in the background"], 
"captions_pred_audio": ["a lawn mower is running and men are speaking ", "water is splashing and gurgling"], "question": "which entity is quieter", "label": 1}, {"captions": ["birds chirp and a dog breathes heavily", "a young female speaks, followed by spraying and a female screaming"], "sample_ids": ["y2ZBGpgbhHM", "uYT5gxnyMWM"], "start_seconds": ["30", "50"], "properties": ["dog, chirp, breathe", "female, spraying, scream"], "captions_pred_video": [null, "footage of a person spraying paint on the ceiling"], "captions_pred_audio": ["birds chirping and a dog panting", "a woman is speaking and a baby is crying"], "question": "which entity is a person", "label": 1}, {"captions": ["a motor runs in the distance as a soft wind periodically gusts", "television program is played far away while a woman talks and birds tweet nearby"], "sample_ids": ["xyL9F5VrjkE", "vbZ-0lGPneg"], "start_seconds": ["20", "30"], "properties": ["wind, motor, distance", "a woman, a television program, a bird"], "captions_pred_video": ["of a caterpillar truck loading logs into a trailer", "of a man holding a baby duck in his hands"], "captions_pred_audio": ["the wind is blowing and a car is passing by ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a television program?", "label": 1}, {"captions": ["loud ringing of a telephone stops followed by a man speaking and a digital beep", "someone whistles a tune"], "sample_ids": ["uzQnlJXBbOM", "sIXTftIuUgw"], "start_seconds": ["50", "90"], "properties": ["ringing, beep, stop", "someone, tune, whistle"], "captions_pred_video": ["footage of a person using a cell phone on a table", null], "captions_pred_audio": ["a telephone rings and a man speaks", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a cat meows as a young woman speaks", "birds chirp and wind blows"], "sample_ids": ["x5cuQjOdM3E", "sxIvBMSavMQ"], "start_seconds": ["30", "210"], "properties": 
["cat, meows, young woman", "birds, chirp, wind"], "captions_pred_video": ["a black background with an airplane flying in the sky", "beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a beehive beekeeping 101 how to extract honey from a"], "captions_pred_audio": ["a cat meows and a woman speaks", "birds are chirping and insects are buzzing"], "question": "which entity is more quiet", "label": 1}, {"captions": ["a toilet flushes and water drains", "females talk and laugh over gusting wind"], "sample_ids": ["sfAvvZwdLCY", "un9VQlzgZM"], "start_seconds": ["20", "5"], "properties": ["water drains, flushes, water", "females, talk, laugh"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["birds fly and flutter around", "winds blows roughly as a vehicle races past"], "sample_ids": ["wGKgwOP3h30", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["fly, flutter, around", "wind, blows, vehicle"], "captions_pred_video": ["of the pigeons in the coop", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["pigeons coo and flap their wings", "a jet engine roars and wind blows "], "question": "which entity is moving faster", "label": 1}, 
{"captions": ["a race car approaches quickly and slows down squealing tires", "a duck quacks continuously"], "sample_ids": ["sEprKHm8Sj8", "vh30P49Po6s"], "start_seconds": ["90", "30"], "properties": ["car, tires, slows", "quacks, continuously, duck"], "captions_pred_video": ["rally 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a duck is quacking loudly"], "question": "which entity is a bird?", "label": 1}, {"captions": ["an engine runs and wind blows", "a vehicle accelerates before a race car idles then accelerates quickly"], "sample_ids": ["vs65y4qmyBE", "sjlVMgdGSK0"], "start_seconds": ["340", "30"], "properties": ["engine, run, wind", "accelerates, vehicle, race car"], "captions_pred_video": ["a car is engulfed in flames on the side of the road", "a 1965 ford falcon drag racing at 100mph on a 1/8 mile track"], "captions_pred_audio": ["a heavy engine is running and men are speaking ", "a car accelerates and revs its engine "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a siren comes to life as a horn blares", "some tunes played by whistling"], "sample_ids": ["u--KhUW8l1Y", "u6BnG6YZqJ4"], "start_seconds": ["0", "0"], "properties": ["horn, siren, life", "tune, play, whistling"], "captions_pred_video": ["a firefighter spraying water from a fire hydrant at night", "a young boy standing in front of a group of kids in a classroom"], "captions_pred_audio": ["a fire truck siren blares and a horn blows ", "a person whistling a song"], "question": "which entity is a musical instrument", "label": 1}, {"captions": ["a duck quacks loudly and continuously", "an insect buzzes around continuously"], "sample_ids": 
["vh30P49Po6s", "v25l1jef3JY"], "start_seconds": ["30", "0"], "properties": ["loud, continuous, quacks", "buzzes, continuously, insect"], "captions_pred_video": ["of a man brushing his teeth with a toothbrush in his mouth", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a duck is quacking loudly", "a fly is buzzing around a microphone "], "question": "which entity is quieter", "label": 1}, {"captions": ["a horn honks followed by a loud continuous buzzing while men speak", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["wsHBIgzs9Fs", "ukg5L09Wpvo"], "start_seconds": ["50", "150"], "properties": ["horn, continuous, buzzing", "clickety-clack, train, whistle"], "captions_pred_video": ["shows a motorcycle riding down a country road with a motorcycle in the foreground", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a car accelerates and revs its engine while a man speaks ", "a train blows its whistle and blows its horn "], "question": "which entity is continuous", "label": 1}, {"captions": ["a man talks while a clock does ticktock", "an insect buzzes around continuously"], "sample_ids": ["spYNpeN7rPY", "v25l1jef3JY"], "start_seconds": ["1", "0"], "properties": ["a clock, ticktock, man", "buzzes, continuously, insect"], "captions_pred_video": ["in 10 words or less what is the name of the song in the maybank advertisement? 
maybank advertisement, maybank, advertisement, advertisements, advertisement video, advertisements video, advertisements, advertisement, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertisement video, advertis", "a black background with a cartoon character in the foreground"], "captions_pred_audio": ["a man is speaking and breathing with background noise ", "a fly is buzzing around a microphone "], "question": "which entity is moving", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "an audience gives applause as a man yells and a group sings"], "sample_ids": ["w2bYrCVLT60", "tdWhHV3X25Q"], "start_seconds": ["120", "60"], "properties": ["ducks, speak, quack", "applause, audience, yells"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", "a man is talking to another man on a stage in front of a microphone"], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a man is speaking and a crowd is clapping"], "question": "which entity is more active", "label": 1}, {"captions": ["a goat screams and people speak in the background", "tapping occurs then a baby cries"], "sample_ids": ["xC8kbrKJmco", "wIJK3-5y0kA"], "start_seconds": ["0", "30"], "properties": ["background, goat, 
scream", "a, cry, baby"], "captions_pred_video": [null, "of a baby playing with a cat in a dark room"], "captions_pred_audio": ["a goat is bleating ", "a baby cries and a woman speaks"], "question": "which entity is crying", "label": 1}, {"captions": ["a few ducks quack and scamper and a man speaks", "someone whistles a tune"], "sample_ids": ["w2bYrCVLT60", "sIXTftIuUgw"], "start_seconds": ["120", "90"], "properties": ["ducks, speak, quack", "someone, tune, whistle"], "captions_pred_video": ["of the ducks drinking from a pink pool in the grass", null], "captions_pred_audio": ["ducks are quacking and a man is speaking", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["a person is snoring while sleeping", "several insects fly while two men talk"], "sample_ids": ["vJrjSeP17yE", "s-T9OVOiMLo"], "start_seconds": ["40", "330"], "properties": ["a person is sleeping, snoring, person", "several, fly, men"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a person snoring loudly", "a man is speaking while insects are buzzing in the background "], "question": "which entity is a person", "label": 0}, {"captions": ["a woman speaks and is crumpling paper", "several beeps are followed by a hit and a woman talking"], "sample_ids": ["xvDdE3zNf8Y", "w34HjHr6gAY"], "start_seconds": ["120", "30"], "properties": ["A, crumple, paper", "beeps, hit, woman"], "captions_pred_video": ["of a woman in a white shirt and glasses holding a purple tie", "the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the 
wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz"], "captions_pred_audio": ["a woman speaks and crumples paper", "a beep sounds followed by a child speaking"], "question": "which entity is a video of a woman speaking and crumpling paper?", "label": 0}, {"captions": ["birds chirp quietly and an adult man speaks", "winds blows roughly as a vehicle races past"], "sample_ids": ["zuua6-5goWw", "xjvTpk2Zpr8"], "start_seconds": ["30", "70"], "properties": ["birds, chirp, quiet, man, speaks", "wind, blows, vehicle"], "captions_pred_video": ["in your own words screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot 10 of 10] screenshot", "footage of a dhl plane landing on the runway"], "captions_pred_audio": ["birds are chirping and a man is speaking with background noise ", "a jet engine roars and wind blows "], "question": "which entity is more calm", "label": 0}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "a man speaks as a motor runs in the background"], "sample_ids": ["vuUVPzd2FXw", "xZepNM9qcRA"], "start_seconds": ["160", "30"], "properties": ["a, steam, release", "background, motor, run"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man speaks while a motorcycle revs and 
accelerates "], "question": "which entity has a motor running in the background?", "label": 1}, {"captions": ["wind noise makes sound into a microphone", "pigeons vocalize and birds chirp"], "sample_ids": ["w8uLijTqtlU", "uiS58TNyUiw"], "start_seconds": ["70", "430"], "properties": ["wind, microphone, noise", "vocalize, bird, chirp"], "captions_pred_video": ["footage is blurry and shaky", "of the pigeon in the cage"], "captions_pred_audio": ["the wind is blowing strongly", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a speedboat passes quickly on the water", "water is sprayed across a hard surface"], "sample_ids": ["tjmoSi330GM", "sQwlkXjQabo"], "start_seconds": ["23", "10"], "properties": ["speed, water, boat", "water, spray, surface"], "captions_pred_video": ["a person riding a jet ski on a lake with trees in the background", "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a motorboat speeds through water with wind noise ", "spraying followed by silence"], "question": "which entity is moving across a hard surface", "label": 1}, {"captions": ["paper is crumpling consistently", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["v5cSxLaHADY", "wDVMhEdTiVw"], "start_seconds": ["0", "30"], "properties": ["paper is crumpling, paper is white, paper is crumpling", "gun, shoot, water"], "captions_pred_video": ["footage of the person holding a pair of scissors", "a blurry image of trees and water in the forest"], "captions_pred_audio": ["paper is crumpled and crinkled", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is not a video of a gun shooting?", "label": 0}, {"captions": ["an electric engine works nearby followed by a child talking", "people cheer as a vehicle engine revs"], "sample_ids": ["xSKJGCItUWE", "xjhAnI2q6hM"], "start_seconds": ["10", "6"], "properties": ["engine, work, child", "engine revs, vehicle, 
people"], "captions_pred_video": ["footage of the helicopter flying in the room", "a school bus decorated with christmas lights is floating in the water"], "captions_pred_audio": ["a high pitched engine is running and a child speaks", "a truck is revving its engine and a man is speaking "], "question": "which entity is about a vehicle engine?", "label": 1}, {"captions": ["water is sprayed across a hard surface", "a man speaks followed by another man speaking outside"], "sample_ids": ["sQwlkXjQabo", "viuTg1M-dqg"], "start_seconds": ["10", "30"], "properties": ["water, spray, surface", "two men, speak, follow"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a vehicle accelerates before a race car idles then accelerates quickly", "pigeons vocalize and birds chirp"], "sample_ids": ["sjlVMgdGSK0", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["accelerates, vehicle, race car", "vocalize, bird, chirp"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "of the pigeon in the cage"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["a stream of water flows quickly", "loud intermittent buzzing with intermittent laughter"], "sample_ids": ["wbHTKEJZyhc", "sLUnaPT5gM8"], "start_seconds": ["20", "0"], "properties": ["stream, water, flow", "loud, laughter, intermittent"], "captions_pred_video": ["footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the 
background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and a bridge in the background video footage of a river in autumn with trees and", "of a baby laying on his stomach in a blue shirt and diaper"], "captions_pred_audio": ["a waterfall is flowing and people are speaking ", "a baby is laughing and breathing while a man is speaking "], "question": "which entity is more quiet", "label": 0}, {"captions": ["a clock ticks quietly and rhythmically", "a stream of water flows as people talk and wind blows"], "sample_ids": ["u7C-AEBQM", "xBxDz0CFVn0"], "start_seconds": ["30", "30"], "properties": ["ticks, rhythmic, quiet", "stream, water, flow"], "captions_pred_video": [null, "footage is blurry and out of focus"], "captions_pred_audio": ["a ticktock of a clock", "a man is speaking with wind noise in the background "], "question": "which entity is moving", "label": 1}, {"captions": ["a man talks followed by a woman shouting", "water is sprayed across a hard surface"], "sample_ids": ["s3cTDAj31g", "sQwlkXjQabo"], "start_seconds": ["80", "10"], "properties": ["man, talk, woman", "water, spray, surface"], "captions_pred_video": [null, "a close-up of a red car with water droplets on the hood"], "captions_pred_audio": ["a 
man is speaking and a baby is crying", "spraying followed by silence"], "question": "which entity is a liquid", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "a loud snarling engine is followed by a man laughing"], "sample_ids": ["sfAvvZwdLCY", "zl9Dqx-j7q4"], "start_seconds": ["20", "6"], "properties": ["flushes, drains, water", "engine, laugh, loud"], "captions_pred_video": ["footage of the toilet in the bathroom", "footage of a man driving a car in the dark"], "captions_pred_audio": ["a toilet is flushed", "a jet engine roars "], "question": "which entity is louder", "label": 1}, {"captions": ["a woman speaks in a fast tone with a male", "someone whistles a tune"], "sample_ids": ["sTpirNYo8vQ", "sIXTftIuUgw"], "start_seconds": ["30", "90"], "properties": ["a, tone, fast", "someone, tune, whistle"], "captions_pred_video": ["of a man taking a selfie on a bus", null], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a person whistling a song"], "question": "which entity is a musical performance", "label": 1}, {"captions": ["people clap and speak in the distance", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["wwyfGO2J4", "tDVADusiIoc"], "start_seconds": ["90", "60"], "properties": ["clap, distance, speak", "water, radio, man"], "captions_pred_video": [null, "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaking together with birds chirping and distant murmuring", "water splashes and wind noise is made into a microphone"], "sample_ids": ["uiS58TNyUiw", "sDSppXIlJrs"], "start_seconds": ["430", "27"], "properties": ["audio, man, speaking", "microphone, water, wind"], "captions_pred_video": ["of the pigeon in the cage", "a man is 
paddling a small wooden boat in the water"], "captions_pred_audio": ["a man is speaking and a bee is buzzing", "the wind is blowing and water is splashing"], "question": "which entity is a recording of a man speaking?", "label": 0}, {"captions": ["water is sprayed across a hard surface", "pigeons vocalize and birds chirp"], "sample_ids": ["sQwlkXjQabo", "uiS58TNyUiw"], "start_seconds": ["10", "430"], "properties": ["water, spray, surface", "vocalize, bird, chirp"], "captions_pred_video": ["a close-up of a red car with water droplets on the hood", "of the pigeon in the cage"], "captions_pred_audio": ["spraying followed by silence", "a man is speaking and a bee is buzzing"], "question": "which entity is not a bird?", "label": 0}, {"captions": ["birds chirp as a bell rings", "multiple people speak and children yell while water gurgles"], "sample_ids": ["ziUT9IFTkjg", "vb1fPSDI4c"], "start_seconds": ["10", "30"], "properties": ["chirp, bell, ring", "multiple, people, yell"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a crowd of people are talking and laughing"], "question": "which entity is more quiet", "label": 0}, {"captions": ["water flows as a woman laughs and a man speaks", "females talk and laugh over gusting wind"], "sample_ids": ["vddP56-ogds", "un9VQlzgZM"], "start_seconds": ["30", "5"], "properties": ["water, flow, laugh", "females, talk, laugh"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running and gurgling and a man is speaking", "a woman is speaking and laughing with wind noise and breathing in the background "], "question": "which entity has more people laughing", "label": 1}, {"captions": ["an insect buzzes around continuously", "a woman speaks followed by another woman whimpering and speaking"], "sample_ids": ["v25l1jef3JY", "xOZfdgAgJ9o"], "start_seconds": ["0", "40"], "properties": ["buzzes, continuously, insect", "woman, whimpering, speaking"], 
"captions_pred_video": ["a black background with a cartoon character in the foreground", "footage of a woman talking to a man in a doctor's office"], "captions_pred_audio": ["a fly is buzzing around a microphone ", "a woman is speaking and a baby is crying"], "question": "which entity is speaking", "label": 1}, {"captions": ["birds chirp then an animal grunts", "a man speaks as a car is passing by"], "sample_ids": ["tDlysoZiA1I", "sK4u5T8hW78"], "start_seconds": ["0", "30"], "properties": ["animal, grunt, chirp", "a, car, pass"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a moving object", "label": 1}, {"captions": ["water splashes as an animal walks through", "small dogs yip and bark sharply"], "sample_ids": ["w1ir-sZ3Im8", "v-wcQf4BDY0"], "start_seconds": ["90", "120"], "properties": ["animal, water, splashes", "bark, yip, sharply"], "captions_pred_video": ["footage of a group of people riding horses through a river", "footage is blurry and shaky, making it difficult to see what is happening"], "captions_pred_audio": ["water splashes and gurgles as people speak", "a dog barks and growls"], "question": "which animal is more active", "label": 1}, {"captions": ["a woman speaks over sizzling noise", "a man speaks over a radio as wind blows and water splashes"], "sample_ids": ["yajyRTUQk3U", "tDVADusiIoc"], "start_seconds": ["400", "60"], "properties": 
["noise, woman, speak", "water, radio, man"], "captions_pred_video": ["- a woman cooking in the kitchen", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking over a radio?", "label": 1}, {"captions": ["people talk quietly in the distance, followed by a police car siren wailing", "vehicles pass by on a roadway"], "sample_ids": ["wy1eKjR7KC0", "tgbONvsP47Y"], "start_seconds": ["30", "0"], "properties": ["people, talk, distance", "pass, vehicle, roadway"], "captions_pred_video": ["two police officers riding motorcycles down the street", "footage of a fire truck entering a garage"], "captions_pred_audio": ["a man is speaking and a siren is going off", "a car is driving on the road "], "question": "which entity is more likely to be in motion", "label": 1}, {"captions": ["cats meow and then a person begins to talk while the cats continue to meow", "pigeons vocalize and birds chirp"], "sample_ids": ["x5cuQjOdM3E", "uiS58TNyUiw"], "start_seconds": ["30", "430"], "properties": ["cat, talk, meow", "vocalize, bird, chirp"], "captions_pred_video": ["a black background with an airplane flying in the sky", "of the pigeon in the cage"], "captions_pred_audio": ["a cat meows and a woman speaks", "a man is speaking and a bee is buzzing"], "question": "which entity is a bird", "label": 1}, {"captions": ["loud clanking and banging with brief male speech", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["sWZzXuWYY", "su6FAOcOA8c"], "start_seconds": ["420", "4"], "properties": ["male, speech, banging", "engine, idle, woman"], "captions_pred_video": [null, "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a sewing machine runs and a man speaks", "a woman is speaking and a subway train is moving "], "question": 
"which entity is a bus?", "label": 1}, {"captions": ["a woman and man speak while food is frying", "water splashes as an animal walks through"], "sample_ids": ["zk-xJGQU8-4", "w1ir-sZ3Im8"], "start_seconds": ["130", "90"], "properties": ["food, man, woman", "animal, water, splashes"], "captions_pred_video": ["a man and a woman cooking in a wok on the stove", "footage of a group of people riding horses through a river"], "captions_pred_audio": ["a woman is speaking while dishes are clanging and music is playing in the background ", "water splashes and gurgles as people speak"], "question": "which entity is about a person", "label": 0}, {"captions": ["a consistent ticking pattern", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sCeWURVHfOM", "sSMl2vc3ek"], "start_seconds": ["30", "20"], "properties": ["ticking, pattern, clock", "loud, multiple, distance"], "captions_pred_video": ["- a close-up view of the clock's inner workings", null], "captions_pred_audio": ["ticking of a clock", "a person snoring loudly"], "question": "which entity is not a clock?", "label": 1}, {"captions": ["people converse in the distance as a clock ticks", "small dogs growl, bark and yip."], "sample_ids": ["vZAw4apG0Es", "sShpyu2l4YQ"], "start_seconds": ["30", "0"], "properties": ["people, clock, converse", "growl, bark, yip"], "captions_pred_video": ["a clock made out of wood and gears with birds on top of it", "the puppies are playing with a toy"], "captions_pred_audio": ["a clock is ticking and people are talking", "a dog is barking and growling"], "question": "which entity is more active", "label": 1}, {"captions": ["a man speaks and wind blows as an aircraft engine becomes louder", "a toilet flushes and a female speaks"], "sample_ids": ["sofxkNWaP0s", "yaln9y8I7ms"], "start_seconds": ["30", "230"], "properties": ["wind, engine, louder", "female, flushes, toilet"], "captions_pred_video": ["of the airplane taking off from the runway with the speed limit sign 
in the foreground", "footage is blurry and out of focus"], "captions_pred_audio": ["a man is speaking while a jet engine roars in the background ", "a toilet flushes and a man speaks"], "question": "which entity is a bathroom?", "label": 1}, {"captions": ["a car speeds away loudly followed by a car revving loudly and driving away while outside", "three men talk while wind blows and some liquid flows"], "sample_ids": ["sjlVMgdGSK0", "vJ7JPEFhyLA"], "start_seconds": ["30", "16"], "properties": ["car, revving, loudly", "three men, wind, flow"], "captions_pred_video": ["a 1965 ford falcon drag racing at 100mph on a 1/8 mile track", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["a car accelerates and revs its engine ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is not a car?", "label": 1}, {"captions": ["a small airplane approaches and then flies by, after and during which a boy speaks", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["zj2R0XoFr5k", "su6FAOcOA8c"], "start_seconds": ["50", "4"], "properties": ["airplane, boy, fly", "engine, idle, woman"], "captions_pred_video": ["footage of a small airplane flying in the sky stock videos and royalty-free footage", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a woman speaks while a helicopter flies overhead ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["a person is snoring while sleeping", "a frog croaks as other frogs croak in the background"], "sample_ids": ["vJrjSeP17yE", "yswmmRZFItk"], "start_seconds": ["40", "0"], "properties": ["a person is sleeping, snoring, person", "background, frog, croak"], "captions_pred_video": ["a black background with a small plane flying in the sky", "a close up of a frog in the water"], "captions_pred_audio": ["a person snoring loudly", "a frog is 
croaking"], "question": "which entity is a croaker", "label": 1}, {"captions": ["a vehicle engine revs and tires squeal", "an airplane engine spools and people speak"], "sample_ids": ["yDoT73BWsdA", "wTjoRj1se3U"], "start_seconds": ["10", "390"], "properties": ["engine revs, tires squeal, vehicle", "airplane, engine, spool"], "captions_pred_video": ["a man driving a race car with a helmet on the steering wheel", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a race car accelerates and revs its engine ", "a jet engine is running and people are talking"], "question": "which entity is a vehicle", "label": 0}, {"captions": ["multiple beeps are followed by a squawk and a child speaking", "a man speaks as a car is passing by"], "sample_ids": ["w34HjHr6gAY", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["beeps, squawk, child speaking", "a, car, pass"], "captions_pred_video": ["the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz 1995 the wizard of oz", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["a beep sounds followed by a child speaking", "a man is speaking with background noise and breathing 
sounds "], "question": "which entity is a man speaking to a car?", "label": 1}, {"captions": ["birds chirp and objects are moved around", "three men talk while wind blows and some liquid flows"], "sample_ids": ["yPUYU6t3rwo", "vJ7JPEFhyLA"], "start_seconds": ["370", "16"], "properties": ["birds chirp, objects are moved around, birds", "three men, wind, flow"], "captions_pred_video": ["footage and stock-footage/b-roll of a beekeeper opening a beehive", "a man in a red shirt paddling a kayak in the water"], "captions_pred_audio": ["insects buzz and a man speaks", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is about moving objects around", "label": 0}, {"captions": ["a machine beeps continuously", "a gun shoots, followed by water sloshing nearby"], "sample_ids": ["y682ml90jGw", "wDVMhEdTiVw"], "start_seconds": ["11", "30"], "properties": ["beeps, machine, continuously", "gun, shoot, water"], "captions_pred_video": [null, "a blurry image of trees and water in the forest"], "captions_pred_audio": ["a beeping sound is being made ", "a gun is fired followed by splashing and a person sneezing"], "question": "which entity is more likely to be used in a war", "label": 1}, {"captions": ["people speaking indiscriminately in the distance with a person snoring loudly nearby", "a duck quacks loudly and continuously"], "sample_ids": ["w2JXXIAdUdg", "vh30P49Po6s"], "start_seconds": ["10", "30"], "properties": ["snoring, distance, person", "loud, continuous, quacks"], "captions_pred_video": ["a close up shot of a person's mouth with a toothbrush in it", "of a man brushing his teeth with a toothbrush in his mouth"], "captions_pred_audio": ["a person snoring and a dog whimpering", "a duck is quacking loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a man speaks followed by another man speaking outside"], "sample_ids": ["zNRChLjqcU", "viuTg1M-dqg"], 
"start_seconds": ["220", "30"], "properties": ["water, faucet, run", "two men, speak, follow"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["water is running from a faucet into a sink", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a video of two men speaking?", "label": 1}, {"captions": ["a vehicle engine runs while a woman makes an announcement", "a train horn blows as it passes by"], "sample_ids": ["su6FAOcOA8c", "zVacuqSb4LI"], "start_seconds": ["4", "30"], "properties": ["engine, run, woman", "horn, blows, train"], "captions_pred_video": ["shows a group of people riding on a crowded subway train", "by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by mike amstong a video by"], "captions_pred_audio": ["a woman is speaking and a subway train is moving ", "a train whistle blows and a train passes by with a whistle blowing "], "question": "which entity is a vehicle?", "label": 0}, {"captions": ["a baby cries and fusses, a woman speaks, and a man speaks", "several insects fly while two men talk"], "sample_ids": ["wyllXV6PjKo", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["a baby, a woman, a man", "several, fly, men"], "captions_pred_video": [null, "a man climbing up a tree 
using a rope to reach the top of the tree"], "captions_pred_audio": ["a woman speaks and a baby cries", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more people", "label": 0}, {"captions": ["a man speaks as a car is passing by", "several insects fly while two men talk"], "sample_ids": ["sK4u5T8hW78", "s-T9OVOiMLo"], "start_seconds": ["30", "330"], "properties": ["a, car, pass", "several, fly, men"], "captions_pred_video": ["for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking with background noise and breathing sounds ", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more moving parts", "label": 1}, {"captions": ["people speak in a closed space", "a man speaks on a radio as wind blows"], "sample_ids": ["sTpirNYo8vQ", "tDVADusiIoc"], "start_seconds": ["30", "60"], "properties": ["people, space, speak", "man, radio, blows"], "captions_pred_video": ["of a man taking a selfie on a bus", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a man is speaking while a car is revving and accelerating ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is in a closed space", "label": 0}, {"captions": ["a man speaks as birds chirp and a vehicle passes nearby", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["siJFXfGWgDk", "wz7N8YRy74I"], "start_seconds": ["50", "30"], "properties": ["a, 
bird, vehicle", "rooster, crow, background, men"], "captions_pred_video": ["footage of a beekeeper working with bees in a beehive", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking and birds are chirping in the background ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a bird in it?", "label": 0}, {"captions": ["a horn blasts as warning bells ring", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["zgUgkpk78xU", "tiDFTC-5vU"], "start_seconds": ["70", "30"], "properties": ["horn, bells, ring", "male, duck, laugh"], "captions_pred_video": ["of a train passing through a small town on a sunny day at 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 1080p 60fps 108", null], "captions_pred_audio": ["a train blows its horn as it speeds down the tracks ", "a man is speaking and ducks are quacking"], "question": "which entity is a warning", "label": 0}, {"captions": ["water splashing and wind blowing as a powerful engine roars", "water drips and bubbles as a man speaks"], "sample_ids": ["yZmhM1HcsyE", "vSeGhaZt-aI"], "start_seconds": ["4", "50"], "properties": ["engine, roar, water", "water, bubbles, speak"], "captions_pred_video": ["footage of a speedboat on a lake with water spraying from the back of the boat", "a man in a kitchen preparing a smoothie with a blender"], "captions_pred_audio": ["a motorboat speeds through water with wind noise in the background ", "a man is speaking and pouring liquid with background noise "], "question": "which entity is more calm", "label": 1}, {"captions": ["a man talks while metallic objects are rapped and steam is released", "several insects fly while two men talk"], "sample_ids": ["vuUVPzd2FXw", "s-T9OVOiMLo"], 
"start_seconds": ["160", "330"], "properties": ["a, steam, release", "several, fly, men"], "captions_pred_video": ["of the person cooking on the grill with a spatula", "a man climbing up a tree using a rope to reach the top of the tree"], "captions_pred_audio": ["a man is speaking and dishes are clanging", "a man is speaking while insects are buzzing in the background "], "question": "which entity has more flying insects", "label": 1}, {"captions": ["a loud snarling engine is followed by a man laughing", "a baby laughs giddily and a woman laughs then speaks"], "sample_ids": ["zl9Dqx-j7q4", "wjsXBsc7M40"], "start_seconds": ["6", "10"], "properties": ["engine, laugh, loud", "a baby laughs, a woman laughs, a woman speaks"], "captions_pred_video": ["footage of a man driving a car in the dark", "footage of the baby playing with a toothbrush"], "captions_pred_audio": ["a jet engine roars ", "a baby laughs and a woman speaks"], "question": "which entity is a human", "label": 1}, {"captions": ["a door opens and birds chirp", "a man speaks as a car is passing by"], "sample_ids": ["yeFvk9x0wWI", "sK4u5T8hW78"], "start_seconds": ["30", "30"], "properties": ["door, open, birds", "a, car, pass"], "captions_pred_video": ["a mouse in a cage on the sidewalk in front of a fence", "for 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai accent 1 6l 2007 hyundai"], "captions_pred_audio": ["birds chirp in the background as a car drives by ", "a man is speaking with background noise and breathing sounds "], "question": "which entity is a person", "label": 1}, {"captions": ["people converse as a motor runs and air brakes hiss", "television program is played 
far away while a woman talks and birds tweet nearby"], "sample_ids": ["zFjIWfSD-4", "vbZ-0lGPneg"], "start_seconds": ["410", "30"], "properties": ["People, motor, brakes", "a woman, a television program, a bird"], "captions_pred_video": [null, "of a man holding a baby duck in his hands"], "captions_pred_audio": ["a man is speaking while a car is driving and a ticking sound is heard ", "a woman is speaking and a dog is whimpering"], "question": "which entity has a bird?", "label": 1}, {"captions": ["bees buzz as wind blows", "a woman talks while something is fried and objects are tapped"], "sample_ids": ["tMJne1a4AFI", "yajyRTUQk3U"], "start_seconds": ["0", "400"], "properties": ["bees, buzz, wind", "a woman, something, fried"], "captions_pred_video": ["a swarm of bees on the ground", "- a woman cooking in the kitchen"], "captions_pred_audio": ["a swarm of bees buzzing around", "a woman is speaking while food is frying in the background"], "question": "which entity is a person", "label": 1}, {"captions": ["people speak softly as food sizzles", "an airplane engine spools and people speak"], "sample_ids": ["yhQ2Lg-7qDY", "wTjoRj1se3U"], "start_seconds": ["130", "390"], "properties": ["food, sizzle, speak", "airplane, engine, spool"], "captions_pred_video": ["a pan filled with meat and sauce being cooked on a stove top", "footage of a man playing with a remote control airplane in a field"], "captions_pred_audio": ["a faucet is running and a man is speaking", "a jet engine is running and people are talking"], "question": "which entity is about a plane?", "label": 1}, {"captions": ["music plays and a woman speaks on a radio before gunshots are fired", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["xKB8O8LTs6s", "tDlysoZiA1I"], "start_seconds": ["70", "0"], "properties": ["music, radio, gunshots", "animal, grunts, chirps"], "captions_pred_video": ["in your own words a screenshot of the game's loading screen", "'s main subject is a dog standing on 
top of a laptop"], "captions_pred_audio": ["music plays while a woman speaks and gunshots are fired ", "birds are chirping and a rooster is crowing "], "question": "which entity is more quiet", "label": 1}, {"captions": ["dogs barking and whimpering", "a person is snoring while sleeping"], "sample_ids": ["tIY7qOV3rEM", "vJrjSeP17yE"], "start_seconds": ["0", "40"], "properties": ["barking, whimpering, dog", "a person is sleeping, snoring, person"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "a black background with a small plane flying in the sky"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a person snoring loudly"], "question": "which entity is a person", "label": 1}, {"captions": ["a girl speaks followed by a scream and more girls talking", "music plays and someone speaks before gunfire and an explosion occurs"], "sample_ids": ["uYT5gxnyMWM", "xKB8O8LTs6s"], "start_seconds": ["50", "70"], "properties": ["a, scream, girl", "music, gunfire, explosion"], "captions_pred_video": ["footage of a person spraying paint on the ceiling", "in your own words a screenshot of the game's loading screen"], "captions_pred_audio": ["a woman is speaking and a baby is crying", "music plays while a woman speaks and gunshots are fired "], "question": "which entity has more gunfire", "label": 1}, {"captions": ["birds chirp and an insect buzzes around", "people converse as a motor runs and air brakes hiss"], "sample_ids": ["t97k0cejSQE", "zFjIWfSD-4"], "start_seconds": ["250", "410"], "properties": ["bird, chirp, insect", "People, motor, brakes"], "captions_pred_video": ["a bee on a purple thistle flower", null], "captions_pred_audio": ["a bee buzzes and a woman speaks", "a man is speaking while a car is driving and a ticking sound is heard "], "question": "which entity is more likely to be in a car", "label": 1}, {"captions": ["a motor slows to a stopover traffic noises", "a woman speaks as she rubs two objects 
together"], "sample_ids": ["zofjfKhqLk8", "vzxHnu-SFEw"], "start_seconds": ["10", "80"], "properties": ["noise, stop, motor", "two objects, woman, speak"], "captions_pred_video": ["footage of a man using a machine to cut a piece of wood", "how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to make a paper cup with scissors youtube how to"], "captions_pred_audio": ["a large engine is running and a bell is ringing", "a woman is speaking and breathing with mechanisms in the background "], "question": "which entity is a person", "label": 1}, {"captions": ["a muffled toilet flushes and the water drains", "dishes cling together then a man begins to speak"], "sample_ids": ["sfAvvZwdLCY", "sQGXqGcwOTc"], "start_seconds": ["20", "3"], "properties": ["flushes, drains, water", "cling, speak, dishes"], "captions_pred_video": ["footage of the toilet in the bathroom", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a toilet is flushed", "mechanisms are operating and water is splashing "], "question": "which entity is about water?", "label": 0}, {"captions": ["a man speaks on a radio as wind blows", "a series of light horn beeps is followed by a loud steam whistle"], "sample_ids": 
["tDVADusiIoc", "wnpJndXuxLc"], "start_seconds": ["60", "50"], "properties": ["man, radio, blows", "beeps, loud, whistle"], "captions_pred_video": ["a person riding on the back of a sailboat in rough seas", "footage of the train coming down the tracks on a snowy day"], "captions_pred_audio": ["a man is speaking while the wind is blowing and water is splashing", "a steam whistle blows and a train moves with wind noise in the background "], "question": "which entity is louder", "label": 1}, {"captions": ["wind blows and a vehicle blows a hard then a train blows a horn", "a car accelerates and wind blows"], "sample_ids": ["wnpJndXuxLc", "u0TrcHhkPQ"], "start_seconds": ["50", "20"], "properties": ["blows, vehicle, train", "accelerates, wind, blows"], "captions_pred_video": ["footage of the train coming down the tracks on a snowy day", null], "captions_pred_audio": ["a steam whistle blows and a train moves with wind noise in the background ", "a race car accelerates and revs its engine "], "question": "which entity is moving", "label": 1}, {"captions": ["goats bleat and metal clings", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["tH17JPjDPnc", "tiDFTC-5vU"], "start_seconds": ["260", "30"], "properties": ["bleat, metal, clings", "male, duck, laugh"], "captions_pred_video": ["feed of the goats eating hay in the barn", null], "captions_pred_audio": ["a cow is mooing and mechanisms are ticking ", "a man is speaking and ducks are quacking"], "question": "which entity is a person speaking?", "label": 1}, {"captions": ["a man speaks and a rooster crows while men talk in the background", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["wz7N8YRy74I", "wz7N8YRy74I"], "start_seconds": ["30", "30"], "properties": ["rooster, crow, background, men", "rooster, crow, background, men"], "captions_pred_video": ["footage of the sun shining through the clouds on a cloudy day", "footage of the sun shining through the clouds on 
a cloudy day"], "captions_pred_audio": ["a man is speaking birds are chirping and a rooster is crowing ", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity has a rooster crow while men talk in the background?", "label": 0}, {"captions": ["a crowd yells, reacts and applauds", "a man speaks as a motor runs in the background"], "sample_ids": ["wztCSUxOf8", "xZepNM9qcRA"], "start_seconds": ["130", "30"], "properties": ["a crowd, yells, applauds", "background, motor, run"], "captions_pred_video": [null, "a close-up view of the motorcycle's engine and exhaust system"], "captions_pred_audio": ["a man is speaking and a crowd is clapping", "a man speaks while a motorcycle revs and accelerates "], "question": "which entity is silent", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a man speaks on a radio as wind blows"], "sample_ids": ["zl9Dqx-j7q4", "tDVADusiIoc"], "start_seconds": ["6", "60"], "properties": ["motors rev, laugh, loudly", "man, radio, blows"], "captions_pred_video": ["footage of a man driving a car in the dark", "a person riding on the back of a sailboat in rough seas"], "captions_pred_audio": ["a jet engine roars ", "a man is speaking while the wind is blowing and water is splashing"], "question": "which entity is a man speaking on a radio?", "label": 1}, {"captions": ["women speak as water runs briefly, children call out, and a man speaks", "a vehicle engine accelerating then running on idle"], "sample_ids": ["uRExseg-0XI", "vYkA3cfXp5Q"], "start_seconds": ["210", "30"], "properties": ["woman, man, water", "engine, accelerate, idle"], "captions_pred_video": ["stock footage or video footage of a person stirring a pot on a stove with a long-handled wooden spoon", "footage of a car driving down the street on a sunny day"], "captions_pred_audio": ["a man is speaking while water is running and birds are chirping ", "an engine is idling"], "question": "which entity is a vehicle?", "label": 
1}, {"captions": ["a bird chirps in response to a woman chirping for the birds", "a person is burping while a girl speaks"], "sample_ids": ["uOpoD0gGXcs", "vdoxuJn9lTc"], "start_seconds": ["120", "40"], "properties": ["chirps, woman, bird", "person, burp, girl"], "captions_pred_video": ["a herd of cows grazing in the field", "a group of young girls playing a video game together in a living room"], "captions_pred_audio": ["birds are chirping and a man is speaking", "a child speaks followed by a burp"], "question": "which entity is a person?", "label": 1}, {"captions": ["birds chirp as a bell rings", "a clock ticktocks"], "sample_ids": ["ziUT9IFTkjg", "v-g-j2uTByM"], "start_seconds": ["10", "30"], "properties": ["chirp, bell, ring", "ticktocks, clock, ticktocks"], "captions_pred_video": [null, "in your own words a cuckoo clock hanging on the wall"], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a clock is ticking loudly"], "question": "which entity is silent", "label": 1}, {"captions": ["a stream of water runs briefly", "a bus engine idles while a woman speaks making an announcement"], "sample_ids": ["x-PeY8Yb8M4", "su6FAOcOA8c"], "start_seconds": ["300", "4"], "properties": ["stream, water, run", "engine, idle, woman"], "captions_pred_video": ["a man sitting on a rock in the middle of a river", "shows a group of people riding on a crowded subway train"], "captions_pred_audio": ["a car is driving on a wet road ", "a woman is speaking and a subway train is moving "], "question": "which entity is stationary", "label": 1}, {"captions": ["someone whistles a song", "an infant crying as a woman laughs"], "sample_ids": ["sIXTftIuUgw", "xhmRY9yhC7c"], "start_seconds": ["90", "20"], "properties": ["someone, song, whistle", "a, laugh, infant"], "captions_pred_video": [null, "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a person whistling a song", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, 
{"captions": ["dishes cling together then a man begins to speak", "water flows and trickles"], "sample_ids": ["sQGXqGcwOTc", "tB7hWb9gTuQ"], "start_seconds": ["3", "30"], "properties": ["cling, speak, dishes", "water, flow, trickle"], "captions_pred_video": ["a man washing dishes in a commercial kitchen", "the rocks on the beach are surrounded by water and the sky is visible in the background"], "captions_pred_audio": ["mechanisms are operating and water is splashing ", "water is splashing and gurgling"], "question": "which entity is a liquid", "label": 1}, {"captions": ["small dogs growl, bark and yip.", "a train whistle keeps going off while the clickety-clack of the train on the rails are continuous"], "sample_ids": ["sShpyu2l4YQ", "ukg5L09Wpvo"], "start_seconds": ["0", "150"], "properties": ["growl, bark, yip", "clickety-clack, train, whistle"], "captions_pred_video": ["the puppies are playing with a toy", "footage of a train passing through a forest on a dirt road"], "captions_pred_audio": ["a dog is barking and growling", "a train blows its whistle and blows its horn "], "question": "which entity is a train", "label": 1}, {"captions": ["a small engine spits as it runs", "a harsh wind blows as a man speaks and another man speaks"], "sample_ids": ["sZvwOuuPGP0", "y8WEcpOlT3I"], "start_seconds": ["50", "40"], "properties": ["spits, engine, runs", "harsh, wind, blows"], "captions_pred_video": ["of a bulldozer clearing a road in a forest stock footage and royalty-free videos", "on how to use a sewing machine youtube"], "captions_pred_audio": ["a medium engine is running ", "a man is speaking with wind noise in the background "], "question": "which entity is not a person?", "label": 0}, {"captions": ["a baby laugh at a sputter", "a small airplane approaches and then flies by, after and during which a boy speaks"], "sample_ids": ["sLUnaPT5gM8", "zj2R0XoFr5k"], "start_seconds": ["0", "50"], "properties": ["laugh, sputter, baby", "airplane, boy, fly"], 
"captions_pred_video": ["of a baby laying on his stomach in a blue shirt and diaper", "footage of a small airplane flying in the sky stock videos and royalty-free footage"], "captions_pred_audio": ["a baby is laughing and breathing while a man is speaking ", "a woman speaks while a helicopter flies overhead "], "question": "which entity is a video", "label": 1}, {"captions": ["a train engine runs and a horn blows", "a male speaks and another male speaks"], "sample_ids": ["zPX9o1uDiI", "viuTg1M-dqg"], "start_seconds": ["40", "30"], "properties": ["engine, horn, run", "two males, speaking, male"], "captions_pred_video": [null, "footage of water coming out of a hole in the ground"], "captions_pred_audio": ["a train moves with its horn blowing and wheels squealing ", "a man is speaking with background noise and breathing sounds "], "question": "which entity has more than one speaker", "label": 1}, {"captions": ["leaves rustle while man speaks", "an infant crying as a woman laughs"], "sample_ids": ["zOZleIRqZm4", "xhmRY9yhC7c"], "start_seconds": ["80", "20"], "properties": ["leaves, rustle, speak", "a, laugh, infant"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "of a baby crying in a baby bouncer"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a baby cries and a woman speaks"], "question": "which entity is a person", "label": 1}, {"captions": ["television program is played far away while a woman talks and birds tweet nearby", "paper folding and crinkling"], "sample_ids": ["vbZ-0lGPneg", "zPpG3RD8lSs"], "start_seconds": ["30", "20"], "properties": ["a woman, a television program, a bird", "paper, fold, crinkle"], "captions_pred_video": ["of a man holding a baby duck in his hands", "how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make 
a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's day card out of a piece of paper youtube how to make a valentine's"], "captions_pred_audio": ["a woman is speaking and a dog is whimpering", "the wind blows and a mouse clicks "], "question": "which entity is a demonstration of folding and crinkling?", "label": 1}, {"captions": ["motors rev and run loudly as a person laughs", "a toilet flushes and water drains unevenly"], "sample_ids": ["zl9Dqx-j7q4", "vhJWZheqaE"], "start_seconds": ["6", "0"], "properties": ["motors rev, laugh, loudly", "water drains unevenly, toilet flushes, water drains"], "captions_pred_video": ["footage of a man driving a car in the dark", null], "captions_pred_audio": ["a jet engine roars ", "a toilet is flushed"], "question": "which entity is a toilet?", "label": 1}, {"captions": ["running water in a faucet with some clinks", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["zNRChLjqcU", "w5W5Kqtc8E"], "start_seconds": ["220", "100"], "properties": ["water, faucet, run", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["water is running from a faucet into a sink", "a motorboat is moving and people are shouting and cheering "], "question": "which entity is a vehicle?", "label": 1}, {"captions": ["a man speaks then blows a vehicle horn as wind blows", "dishes cling together then a man begins to speak"], "sample_ids": ["zALy31PjDl0", "sQGXqGcwOTc"], "start_seconds": ["21", "3"], "properties": 
["a man, a vehicle, a horn", "cling, speak, dishes"], "captions_pred_video": ["a motorcycle is parked on the side of a brick walkway", "a man washing dishes in a commercial kitchen"], "captions_pred_audio": ["a man is speaking and a car horn is honking", "mechanisms are operating and water is splashing "], "question": "which entity is about a man speaking?", "label": 0}, {"captions": ["a jet engine screams, then increases its power", "a car accelerates and wind blows"], "sample_ids": ["vBslzh7saPw", "u0TrcHhkPQ"], "start_seconds": ["90", "20"], "properties": ["power, scream, increase", "accelerates, wind, blows"], "captions_pred_video": ["a pickup truck carrying a large object down the road", null], "captions_pred_audio": ["a jet engine roars and accelerates ", "a race car accelerates and revs its engine "], "question": "which entity is moving faster", "label": 1}, {"captions": ["a motor vehicle roars, drowning out people speaking in the background", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["s4Uz1Ffgo04", "vfYTJq7nU"], "start_seconds": ["100", "130"], "properties": ["roars, background, people speaking", "rustling, ducks, quack"], "captions_pred_video": ["footage of an ambulance arriving at the scene of an accident", null], "captions_pred_audio": ["a man is speaking while a boat is moving and wind is blowing ", "a duck quacks and a woman speaks"], "question": "which entity is more quiet", "label": 1}, {"captions": ["leaves rustling followed by a small bell chiming as birds chirp in the background", "a vehicle engine runs and wind blows before women yell"], "sample_ids": ["ziUT9IFTkjg", "w5W5Kqtc8E"], "start_seconds": ["10", "100"], "properties": ["background, birds, rustling", "wind, blow, vehicle"], "captions_pred_video": [null, null], "captions_pred_audio": ["birds are chirping and a chime is ringing ", "a motorboat is moving and people are shouting and 
cheering "], "question": "which entity is about a vehicle engine running and wind blowing?", "label": 1}, {"captions": ["insects humming with a dog barking and small goat bleating", "wind blows as people chatter quietly"], "sample_ids": ["tIY7qOV3rEM", "xBxDz0CFVn0"], "start_seconds": ["0", "30"], "properties": ["animal, bark, dog, barking, small, goat, bleating", "wind, chatter, people"], "captions_pred_video": ["a dog is standing in the middle of a dirt road in the woods", "footage is blurry and out of focus"], "captions_pred_audio": ["a dog is barking and a cat is meowing", "a man is speaking with wind noise in the background "], "question": "which entity is quieter", "label": 1}, {"captions": ["a toilet flushes and water drains", "a person snores loudly multiple times at a close distance"], "sample_ids": ["sfAvvZwdLCY", "sSMl2vc3ek"], "start_seconds": ["20", "20"], "properties": ["water drains, flushes, water", "loud, multiple, distance"], "captions_pred_video": ["footage of the toilet in the bathroom", null], "captions_pred_audio": ["a toilet is flushed", "a person snoring loudly"], "question": "which entity is louder", "label": 1}, {"captions": ["people clap and speak in the distance", "music plays, a person speaks, followed by whooshes and a ding"], "sample_ids": ["wwyfGO2J4", "tQWGZLItBXk"], "start_seconds": ["90", "170"], "properties": ["clap, distance, speak", "music, person, ding"], "captions_pred_video": [null, "worms revolution screenshots"], "captions_pred_audio": ["people are clapping and speaking with background noise ", "a child speaks music plays video game sounds sound effects and sound effects play "], "question": "which entity has a person speaking?", "label": 1}, {"captions": ["a woman talks while something is fried and objects are tapped", "rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown"], "sample_ids": ["yajyRTUQk3U", "vfYTJq7nU"], "start_seconds": ["400", 
"130"], "properties": ["a woman, something, fried", "rustling, ducks, quack"], "captions_pred_video": ["- a woman cooking in the kitchen", null], "captions_pred_audio": ["a woman is speaking while food is frying in the background", "a duck quacks and a woman speaks"], "question": "which entity is about a woman talking?", "label": 0}, {"captions": ["several ducks quack and cocks crow far away", "various birds chirp and squeal, and an animal grunts"], "sample_ids": ["sNB8zxXneIM", "tDlysoZiA1I"], "start_seconds": ["20", "0"], "properties": ["several, quack, cocks", "animal, grunts, chirps"], "captions_pred_video": ["a group of geese in a cage", "'s main subject is a dog standing on top of a laptop"], "captions_pred_audio": ["a rooster is crowing and wind is blowing ", "birds are chirping and a rooster is crowing "], "question": "which entity is more animal-like", "label": 1}, {"captions": ["leaves rustle while man speaks", "a man speaks and a rooster crows while men talk in the background"], "sample_ids": ["zOZleIRqZm4", "wz7N8YRy74I"], "start_seconds": ["80", "30"], "properties": ["leaves, rustle, speak", "rooster, crow, background, men"], "captions_pred_video": ["a person picking berries from the bushes in the garden", "footage of the sun shining through the clouds on a cloudy day"], "captions_pred_audio": ["a man is speaking with crickets chirping in the background", "a man is speaking birds are chirping and a rooster is crowing "], "question": "which entity is more likely to be in a farm setting", "label": 1}, {"captions": ["several insects fly while two men talk", "a male is speaking and a duck quacks as others laugh"], "sample_ids": ["s-T9OVOiMLo", "tiDFTC-5vU"], "start_seconds": ["330", "30"], "properties": ["several, fly, men", "male, duck, laugh"], "captions_pred_video": ["a man climbing up a tree using a rope to reach the top of the tree", null], "captions_pred_audio": ["a man is speaking while insects are buzzing in the background ", "a man is speaking and 
ducks are quacking"], "question": "which entity has more people", "label": 1}, {"captions": ["various birds chirp and squeal, and an animal grunts", "someone whistles a tune"], "sample_ids": ["tDlysoZiA1I", "sIXTftIuUgw"], "start_seconds": ["0", "90"], "properties": ["animal, grunts, chirps", "someone, tune, whistle"], "captions_pred_video": ["'s main subject is a dog standing on top of a laptop", null], "captions_pred_audio": ["birds are chirping and a rooster is crowing ", "a person whistling a song"], "question": "which entity is a human", "label": 1}] \ No newline at end of file diff --git a/lavis/configs/datasets/esc50/defaults_mm_cls.yaml b/lavis/configs/datasets/esc50/defaults_mm_cls.yaml new file mode 100644 index 0000000000000000000000000000000000000000..375b62b3bbbc5a8d062c9ed5896848989321b4a7 --- /dev/null +++ b/lavis/configs/datasets/esc50/defaults_mm_cls.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + esc50_cls: # + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_instruction + modality: audio + task: classification + eval: + name: blip_caption + + data_type: [audio] + + build_info: + annotations: + + val: + url: + - https://raw.githubusercontent.com/karolpiczak/ESC-50/master/meta/esc50.csv + storage: + - /export/einstein-vision/audio_datasets/ESC-50-master/meta/esc50.csv + + audio: + storage: /export/einstein-vision/audio_datasets/ESC-50-master/audio \ No newline at end of file diff --git a/lavis/configs/datasets/flickr30k/defaults.yaml b/lavis/configs/datasets/flickr30k/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9127cab813b8caa3286a7bf79533f33babbebde3 --- /dev/null +++ 
b/lavis/configs/datasets/flickr30k/defaults.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + flickr30k: + # data_dir: ${env.data_dir}/datasets + data_type: images + + build_info: + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json + storage: flickr30k/annotations/train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json + storage: flickr30k/annotations/val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json + storage: flickr30k/annotations/test.json + images: + storage: flickr30k/images + # storage: /export/share/datasets/vision/flickr30k diff --git a/lavis/configs/datasets/flickr30k/defaults_cap.yaml b/lavis/configs/datasets/flickr30k/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..021f5033e7b02e2071f0b0bdca1f4ad04e2e4032 --- /dev/null +++ b/lavis/configs/datasets/flickr30k/defaults_cap.yaml @@ -0,0 +1,39 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + flickr30k_caption: # name of the dataset builder + # dataset_card: dataset_card/coco_caption.md + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json + storage: flickr30k/annotations/train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json + storage: flickr30k/annotations/val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json + storage: flickr30k/annotations/test.json + images: + # storage: flickr30k/images + storage: /export/share/datasets/vision/flickr30k diff --git a/lavis/configs/datasets/flickr30k/defaults_cap_instruct.yaml b/lavis/configs/datasets/flickr30k/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15f07503823fb79e6b3fe0698e5ee4b684a6eaeb --- /dev/null +++ b/lavis/configs/datasets/flickr30k/defaults_cap_instruct.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + flickr30k_caption_instruct: # name of the dataset builder + # dataset_card: dataset_card/coco_caption.md + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json + storage: flickr30k/annotations/train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json + storage: flickr30k/annotations/val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json + storage: flickr30k/annotations/test.json + images: + # storage: flickr30k/images + storage: /export/share/datasets/vision/flickr30k diff --git a/lavis/configs/datasets/gqa/balanced_testdev.yaml b/lavis/configs/datasets/gqa/balanced_testdev.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86114fb964cda8e58f9848c216c5f4ae5f28ca70 --- /dev/null +++ b/lavis/configs/datasets/gqa/balanced_testdev.yaml @@ -0,0 +1,30 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json + storage: + - gqa/annotations/train_balanced_questions.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json + storage: + - gqa/annotations/testdev_balanced_questions.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json + storage: + - gqa/annotations/test_balanced_questions.json + images: + storage: gqa/images/ diff --git a/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml b/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d3b495894af4460dbfd20b4a4a951811e57977f --- /dev/null +++ b/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: qa + modality: image + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json + storage: + - gqa/annotations/train_balanced_questions.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json + storage: + - gqa/annotations/testdev_balanced_questions.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json + storage: + - gqa/annotations/test_balanced_questions.json + images: + storage: /export/share/datasets/vision/GQA/images #gqa/images/ diff --git a/lavis/configs/datasets/gqa/balanced_val.yaml b/lavis/configs/datasets/gqa/balanced_val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca420dfdcef381bebb261b4178b9a288bc331d5f --- /dev/null +++ b/lavis/configs/datasets/gqa/balanced_val.yaml @@ -0,0 +1,30 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json + storage: + - gqa/annotations/train_balanced_questions.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json + storage: + - gqa/annotations/val_balanced_questions.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json + storage: + - gqa/annotations/test_balanced_questions.json + images: + storage: gqa/images/ diff --git a/lavis/configs/datasets/gqa/balanced_val_instruct.yaml b/lavis/configs/datasets/gqa/balanced_val_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43f2277dcec3e923d8fdb583b8b1311f3a4eee76 --- /dev/null +++ b/lavis/configs/datasets/gqa/balanced_val_instruct.yaml @@ -0,0 +1,47 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: qa + modality: image + eval: + name: blip_question + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json + storage: + - gqa/annotations/train_balanced_questions.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json + storage: + - gqa/annotations/val_balanced_questions.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json + storage: + - gqa/annotations/test_balanced_questions.json + images: + storage: /export/share/datasets/vision/GQA/images #gqa/images/ diff --git a/lavis/configs/datasets/gqa/defaults.yaml b/lavis/configs/datasets/gqa/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c2d87cd7afcedd52f65c5103d5d9d697f5de7f7 --- /dev/null +++ b/lavis/configs/datasets/gqa/defaults.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json + - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json + storage: + - gqa/annotations/train_all_questions_0.json + - gqa/annotations/val_all_questions.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json + storage: + - aokvqa/annotations/aokvqa_v1p0_val.json + - aokvqa/annotations/large_vocab_train_lavis.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json + storage: + - aokvqa/annotations/aokvqa_v1p0_test.json + - aokvqa/annotations/large_vocab_train_lavis.json + images: + storage: gqa/images/ diff --git a/lavis/configs/datasets/gqa/defaults_instruct.yaml b/lavis/configs/datasets/gqa/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86bcc6e1e37e2b5da456331f989471d5e4e5fcd5 --- /dev/null +++ b/lavis/configs/datasets/gqa/defaults_instruct.yaml @@ -0,0 +1,55 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: qa + modality: image + eval: + name: blip_question + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/train_all_questions_0.json + # - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json + # - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/val_all_questions.json + storage: + - gqa/annotations/train_all_questions_0.json + - gqa/annotations/val_all_questions.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json + storage: + - aokvqa/annotations/aokvqa_v1p0_val.json + - aokvqa/annotations/large_vocab_train_lavis.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json + storage: + - aokvqa/annotations/aokvqa_v1p0_test.json + - aokvqa/annotations/large_vocab_train_lavis.json + images: + storage: /export/share/datasets/vision/GQA/images #gqa/images/ diff --git a/lavis/configs/datasets/iconqa/defaults.yaml b/lavis/configs/datasets/iconqa/defaults.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..b09339987d5d102c1f46d01d699b59c321a9a388 --- /dev/null +++ b/lavis/configs/datasets/iconqa/defaults.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + iconqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + + text_processor: + train: + name: blip_question + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json + # - /export/share/datasets/vision_language/iconqa/annotations_train.json + storage: + - iconqa/annotations/train.json + # - /export/share/datasets/vision_language/iconqa/annotations_train.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json + # - /export/share/datasets/vision_language/iconqa/annotations_val.json + storage: + - iconqa/annotations/val.json + # - /export/share/datasets/vision_language/iconqa/annotations_val.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json + # - /export/share/datasets/vision_language/iconqa/annotations_test.json + storage: + - iconqa/annotations/test.json + # - /export/share/datasets/vision_language/iconqa/annotations_test.json + images: + storage: /export/share/datasets/vision_language/iconqa/all_images/ + diff --git a/lavis/configs/datasets/iconqa/defaults_instruct.yaml b/lavis/configs/datasets/iconqa/defaults_instruct.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..74c286a10bcbdcf78d6b8e476c2657073b9190b9 --- /dev/null +++ b/lavis/configs/datasets/iconqa/defaults_instruct.yaml @@ -0,0 +1,55 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + iconqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + + text_processor: + train: + name: blip_instruction + modality: image + task: qa + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json + # - /export/share/datasets/vision_language/iconqa/annotations_train.json + storage: + - iconqa/annotations/train.json + # - /export/share/datasets/vision_language/iconqa/annotations_train.json + # val: + # url: + # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json + # # - /export/share/datasets/vision_language/iconqa/annotations_val.json + # storage: + # - iconqa/annotations/val.json + # # - /export/share/datasets/vision_language/iconqa/annotations_val.json + # test: + # url: + # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json + # # - /export/share/datasets/vision_language/iconqa/annotations_test.json + # storage: + # - iconqa/annotations/test.json + # # - /export/share/datasets/vision_language/iconqa/annotations_test.json + + images: + storage: /export/share/datasets/vision_language/iconqa/all_images/ + diff --git a/lavis/configs/datasets/imagenet/defaults.yaml 
b/lavis/configs/datasets/imagenet/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a20779b43df16508f096d2159db3357e8b3ee4d --- /dev/null +++ b/lavis/configs/datasets/imagenet/defaults.yaml @@ -0,0 +1,15 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + imagenet: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + splits: ["val"] + images: + storage: /export/share/datasets/vision/imagenet diff --git a/lavis/configs/datasets/laion/defaults_2B_multi.yaml b/lavis/configs/datasets/laion/defaults_2B_multi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..279a736fa1b8a5fd79821a7732c4d3bd7c4d5214 --- /dev/null +++ b/lavis/configs/datasets/laion/defaults_2B_multi.yaml @@ -0,0 +1,13 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + laion2B_multi: + + data_type: images + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar diff --git a/lavis/configs/datasets/laion/defaults_400M.yaml b/lavis/configs/datasets/laion/defaults_400M.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a93b769959937d5cc510db50963c7b075c91393d --- /dev/null +++ b/lavis/configs/datasets/laion/defaults_400M.yaml @@ -0,0 +1,20 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + laion400M: + + data_type: images + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar +# storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar diff --git a/lavis/configs/datasets/laion/defaults_400M_instruct.yaml b/lavis/configs/datasets/laion/defaults_400M_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7f4a5c1766a75878150fa5d58ab42b5454e81cb --- /dev/null +++ b/lavis/configs/datasets/laion/defaults_400M_instruct.yaml @@ -0,0 +1,31 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + laion400M_instruct: + + data_type: images + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar +# storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar diff --git a/lavis/configs/datasets/llava150k/defaults_dial.yaml b/lavis/configs/datasets/llava150k/defaults_dial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f315dd6f7eadc1e9986e85e3ea8483e38b10fdf6 --- /dev/null +++ 
b/lavis/configs/datasets/llava150k/defaults_dial.yaml @@ -0,0 +1,32 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + llava150k_dialogue_instruct: #394276 train examples + + data_type: images + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: "blip_caption" + + build_info: + annotations: + train: + url: + - https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json + storage: + - LLaVA-Instruct-150K/annotations/lava_instruct_150k.json + # Be careful not to append minus sign (-) before split to avoid itemizing + images: + storage: /export/share/datasets/vision/coco/images/train2017 diff --git a/lavis/configs/datasets/modelnet40/defaults_cls.yaml b/lavis/configs/datasets/modelnet40/defaults_cls.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c16d0aa63d1b235c27f8d9e2e2757fe41158e49 --- /dev/null +++ b/lavis/configs/datasets/modelnet40/defaults_cls.yaml @@ -0,0 +1,55 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + modelnet40_cls: # name of the dataset builder + data_type: [pc, images] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt + - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat + - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train.txt + storage: + - modelnet40_normal_resampled/modelnet40_shape_names.txt + - modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat + - /modelnet40_normal_resampled/modelnet40_train.txt + val: + url: + - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt + - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat + - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test.txt + storage: + - modelnet40_normal_resampled/modelnet40_shape_names.txt + - modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat + - modelnet40_normal_resampled/modelnet40_test.txt + + pc: + storage: /export/home/ULIP/data/modelnet40_normal_resampled + + images: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet40_pc_img \ No newline at end of file diff --git 
a/lavis/configs/datasets/msrvtt/defaults_cap.yaml b/lavis/configs/datasets/msrvtt/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3385b46931b640ec18113a20232010f3a11e233 --- /dev/null +++ b/lavis/configs/datasets/msrvtt/defaults_cap.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msrvtt_cap: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json + storage: msrvtt/annotations/cap_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json + storage: msrvtt/annotations/cap_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json + storage: msrvtt/annotations/cap_test.json + videos: + storage: msrvtt/videos diff --git a/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml b/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce550ec35ab4252dc0925241c97eecfde6f60bc5 --- /dev/null +++ b/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml @@ -0,0 +1,48 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msrvtt_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: blip_instruction + task: caption + modality: video + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json + storage: msrvtt/annotations/cap_train.json + # val: + # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json + # storage: msrvtt/annotations/cap_val.json + # test: + # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json + # storage: msrvtt/annotations/cap_test.json + videos: + storage: msrvtt/videos diff --git a/lavis/configs/datasets/msrvtt/defaults_qa.yaml b/lavis/configs/datasets/msrvtt/defaults_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df1c4ad0a79117604ea903d567948e15ec941382 --- /dev/null +++ b/lavis/configs/datasets/msrvtt/defaults_qa.yaml @@ -0,0 +1,27 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msrvtt_qa: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json + storage: msrvtt/annotations/qa_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json + storage: msrvtt/annotations/qa_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json + storage: msrvtt/annotations/qa_test.json + ans2label: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json + storage: msrvtt/annotations/qa_ans2label.json + videos: + storage: msrvtt/videos diff --git a/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml b/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d9c582472dbba95435f4ac19857df182fc94834 --- /dev/null +++ b/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msrvtt_qa_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: blip_instruction + task: qa + modality: video + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json + storage: msrvtt/annotations/qa_train.json + # val: + # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json + # storage: msrvtt/annotations/qa_val.json + # test: + # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json + # storage: msrvtt/annotations/qa_test.json + ans2label: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json + storage: msrvtt/annotations/qa_ans2label.json + videos: + storage: msrvtt/videos diff --git a/lavis/configs/datasets/msrvtt/defaults_ret.yaml b/lavis/configs/datasets/msrvtt/defaults_ret.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0cc55b39c9ba69f3aa61f17e4c4519923db7df0 --- /dev/null +++ b/lavis/configs/datasets/msrvtt/defaults_ret.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msrvtt_retrieval: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json + storage: msrvtt/annotations/retrieval_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json + storage: msrvtt/annotations/retrieval_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json + storage: msrvtt/annotations/retrieval_test.json + videos: + storage: msrvtt/videos diff --git a/lavis/configs/datasets/msvd/defaults_cap.yaml b/lavis/configs/datasets/msvd/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d9e870bd4b6b045034aefe47dbb5f8cff9bdc45b --- /dev/null +++ b/lavis/configs/datasets/msvd/defaults_cap.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msvd_cap: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json + storage: msvd/annotations/cap_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json + storage: msvd/annotations/cap_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json + storage: msvd/annotations/cap_test.json + videos: + storage: msvd/videos diff --git a/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml b/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4032b298c6eaa7f4767a89717dbe22719283a559 --- /dev/null +++ b/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml @@ -0,0 +1,50 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msvd_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: blip_instruction + task: caption + modality: video + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json + storage: msvd/annotations/cap_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json + storage: msvd/annotations/cap_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json + storage: msvd/annotations/cap_test.json + videos: + # storage: msvd/videos + storage: /export/share/datasets/vision_language/msvd/videos diff --git a/lavis/configs/datasets/msvd/defaults_qa.yaml b/lavis/configs/datasets/msvd/defaults_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b4bbbd3fb44bf0bb40d14c68bc07b825a541577 --- /dev/null +++ b/lavis/configs/datasets/msvd/defaults_qa.yaml @@ -0,0 +1,29 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msvd_qa: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json + storage: msvd/annotations/qa_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json + storage: msvd/annotations/qa_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json + storage: msvd/annotations/qa_test.json + ans2label: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json + storage: msvd/annotations/qa_ans2label.json + videos: + storage: msvd/videos + + instance_id_key: question_id diff --git a/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml b/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7b2ad77b7446941a617dcc48e20807846571a7b --- /dev/null +++ b/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msvd_qa_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: blip_instruction + task: qa + modality: video + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json + storage: msvd/annotations/qa_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json + storage: msvd/annotations/qa_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json + storage: msvd/annotations/qa_test.json + ans2label: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json + storage: msvd/annotations/qa_ans2label.json + videos: + storage: /export/share/datasets/vision_language/msvd/videos + + instance_id_key: question_id diff --git a/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml b/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c5da606ed675a84e72e22acfad2cd01f1239672 --- /dev/null +++ b/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml @@ -0,0 +1,66 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + musicavqa_mm: # name of the dataset builder + data_type: [video, audio] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: blip_question + eval: + name: blip_question + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json + # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json + storage: + - /musicavqa/annotations/avqa-val.json + # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json + + test: + url: + # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json + - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json + storage: + # - /musicavqa/annotations/avqa-test.json + - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real + + video: + storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real + diff --git a/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml b/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11d78f6cfbbe5cb1d83137a54f0509cecae48ef9 --- /dev/null +++ 
b/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml @@ -0,0 +1,69 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + musicavqa_mm_instruct: # name of the dataset builder + data_type: [video, audio] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + text_processor: + train: + name: blip_instruction + task: qa + modality: video + eval: + name: blip_question + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json + # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json + storage: + - /musicavqa/annotations/avqa-val.json + # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json + + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json + # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json + storage: + - /musicavqa/annotations/avqa-test.json + # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real + + video: + storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real + diff --git a/lavis/configs/datasets/nlvr/defaults.yaml b/lavis/configs/datasets/nlvr/defaults.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..96a985598259861e9d23b47d2dffcf7d06b22e69 --- /dev/null +++ b/lavis/configs/datasets/nlvr/defaults.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + nlvr: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json + storage: nlvr/annotations/train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json + storage: nlvr/annotations/dev.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json + storage: nlvr/annotations/test.json + images: + storage: /export/share/datasets/vision/NLVR2/ diff --git a/lavis/configs/datasets/nocaps/defaults.yaml b/lavis/configs/datasets/nocaps/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..062b03c00e1bf24461e201a3dcfb9250a456b9d7 --- /dev/null +++ b/lavis/configs/datasets/nocaps/defaults.yaml @@ -0,0 +1,22 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + nocaps: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json + storage: nocaps/annotations/nocaps_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json + storage: nocaps/annotations/nocaps_test.json + images: + storage: nocaps/images + # storage: /export/share/datasets/vision/nocaps/ diff --git a/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml b/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..083e628d497a39bb8f9684f7132683ddf060d99d --- /dev/null +++ b/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml @@ -0,0 +1,54 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + objaverse_mm_caption: # 651576 train examples + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + data_type: [pc, images] # [images|pc] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json + storage: + - objaverse/annotations/train.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json + + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json + storage: + - objaverse/annotations/val.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel + + images: + storage: /export/einstein-vision/3d_vision/objaverse_captions/images/ \ No newline at end of file diff --git a/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec36400bc4f9dc17a2e6b64cc557e6e4c424f97c --- /dev/null +++ b/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml @@ -0,0 +1,55 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + objaverse_mm_caption_instruct: # 651576 train examples + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_instruction" + modality: pc + task: caption + eval: + name: "blip_caption" + + data_type: [pc, images] # [images|pc] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json + storage: + - objaverse/annotations/train.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json + + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json + # storage: + # # - objaverse/annotations/val.csv + # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel + + images: + storage: /export/einstein-vision/3d_vision/objaverse_captions/images/ \ No newline at end of file diff --git a/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml b/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e91346329921d2365429ec59f3f7952d27eaf9b --- /dev/null +++ b/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml @@ -0,0 +1,55 @@ + # Copyright (c) 2023, salesforce.com, inc. 
+ # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + objaverse_mm_qa: # 250070 + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_instruction" + modality: pc + task: qa + eval: + name: "blip_question" + + + data_type: pc # [images|pc] + + build_info: + kwargs: + add_binary: True + remove_model_answer: True + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final.csv + # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv + storage: + - objaverse_qa/annotations/train.csv + # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv + # val: + # url: + # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final_val.csv + # # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv + # storage: + # - objaverse_qa/annotations/val.csv + # # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel \ No newline at end of file diff --git a/lavis/configs/datasets/ocrvqa/defaults.yaml b/lavis/configs/datasets/ocrvqa/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eab021156013e3208cc3711ef4b581215b47355f --- /dev/null +++ b/lavis/configs/datasets/ocrvqa/defaults.yaml @@ -0,0 +1,33 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + ocr_vqa: # 1002146 train examples + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + + text_processor: + train: + name: "blip_question" + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json + # - /export/video-language-dataset/ocrvqa/ocrvqa.json + storage: + - ocrvqa/annotations/ocrvqa.json + # - /export/video-language-dataset/ocrvqa/ocrvqa.json + images: + storage: /export/video-language-dataset/ocrvqa/images/ diff --git a/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml b/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6fb4eef8551db678811fdf228dec8db0280afe5 --- /dev/null +++ b/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml @@ -0,0 +1,35 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + ocr_vqa_instruct: # 1002146 train examples + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: qa + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json + # - /export/video-language-dataset/ocrvqa/ocrvqa.json + storage: + - ocrvqa/annotations/ocrvqa.json + # - /export/video-language-dataset/ocrvqa/ocrvqa.json + images: + storage: /export/video-language-dataset/ocrvqa/images/ diff --git a/lavis/configs/datasets/okvqa/defaults.yaml b/lavis/configs/datasets/okvqa/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a442c9bb2f12951ae812b924b5b12a038bcce75d --- /dev/null +++ b/lavis/configs/datasets/okvqa/defaults.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + ok_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + # TODO make this order insensitive + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json + storage: + - okvqa/annotations/okvqa_train.json + # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json + # - okvqa/annotations/mscoco_train2014_annotations.json + test: + url: + # TODO make this order insensitive + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json + storage: + - okvqa/annotations/vqa_val_eval.json + - okvqa/annotations/answer_list.json + - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json + - okvqa/annotations/mscoco_val2014_annotations.json + images: + storage: coco/images/ diff --git a/lavis/configs/datasets/okvqa/defaults_instruct.yaml b/lavis/configs/datasets/okvqa/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96c56c240cca95d9c87f624185e1af4b0be1b026 --- /dev/null +++ 
b/lavis/configs/datasets/okvqa/defaults_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + ok_vqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: qa + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + # TODO make this order insensitive + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json + storage: + - okvqa/annotations/okvqa_train.json + # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json + # - okvqa/annotations/mscoco_train2014_annotations.json + # test: + # url: + # # TODO make this order insensitive + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json + # storage: + # - okvqa/annotations/vqa_val_eval.json + # - okvqa/annotations/answer_list.json + # - 
okvqa/annotations/OpenEnded_mscoco_val2014_questions.json + # - okvqa/annotations/mscoco_val2014_annotations.json + images: + storage: /export/share/datasets/vision/coco/images diff --git a/lavis/configs/datasets/sbu_caption/defaults.yaml b/lavis/configs/datasets/sbu_caption/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a5a22053831056a241556ed1a37321595f00794 --- /dev/null +++ b/lavis/configs/datasets/sbu_caption/defaults.yaml @@ -0,0 +1,22 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + sbu_caption: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json + # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json + storage: + - sbu_captions/annotations/sbu.json + images: + storage: sbu_captions/images + # storage: /export/share/datasets/vision_language/sbu_resize diff --git a/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml b/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1611df1c8c493863554d1e36f8e1ae9a6b261d61 --- /dev/null +++ b/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml @@ -0,0 +1,38 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + sbu_caption_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json + # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json + storage: + - sbu_captions/annotations/sbu.json + images: + storage: sbu_captions/images + # storage: /export/share/datasets/vision_language/sbu_resize diff --git a/lavis/configs/datasets/scienceqa/defaults.yaml b/lavis/configs/datasets/scienceqa/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e33493b6d67178a8f386a611f4bef08d94d2681d --- /dev/null +++ b/lavis/configs/datasets/scienceqa/defaults.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + scienceqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_question + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json + storage: + - scienceqa/annotations/problems_train.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json + storage: + - scienceqa/annotations/problems_val.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json + storage: + - scienceqa/annotations/problems_test.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json + + images: + storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/ + diff --git a/lavis/configs/datasets/scienceqa/defaults_instruct.yaml b/lavis/configs/datasets/scienceqa/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3128647c21c28ee3a473ca5aa5d35848c8435d46 --- /dev/null +++ b/lavis/configs/datasets/scienceqa/defaults_instruct.yaml @@ -0,0 +1,54 @@ + # Copyright 
(c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + scienceqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: qa + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json + storage: + - scienceqa/annotations/problems_train.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json + storage: + - scienceqa/annotations/problems_val.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json + storage: + - scienceqa/annotations/problems_test.json + # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json + + images: + storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/ + diff --git a/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml b/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c6fb08a21f7cd245139f83f4e6f840f03606211 
--- /dev/null +++ b/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + shapenet_mm_caption: # name of the dataset builder + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + data_type: [pc, images] # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + storage: + - shapenet/annotations/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + storage: + - shapenet/annotations/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc + + images: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images \ No newline at end of file diff --git a/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..002379fc0bef72c8e327c3284967f4f234c76303 --- /dev/null +++ 
b/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + shapenet_mm_caption_instruct: # name of the dataset builder + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_instruction" + modality: pc + task: caption + eval: + name: "blip_caption" + + data_type: [pc, images] # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + storage: + - shapenet/annotations/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + storage: + - shapenet/annotations/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc + + images: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images \ No newline at end of file diff --git a/lavis/configs/datasets/snli_ve/defaults.yaml b/lavis/configs/datasets/snli_ve/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91b6cf7fd9b79b1d6a26ae25eed38cda61b83d01 --- /dev/null +++ 
b/lavis/configs/datasets/snli_ve/defaults.yaml @@ -0,0 +1,25 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + snli_ve: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json + storage: snli/annotations/ve_train.json + val: + url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json + storage: snli/annotations/ve_dev.json + test: + url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json + storage: snli/annotations/ve_test.json + images: + storage: flickr30k/images/flickr30k-images + # storage: /export/share/datasets/vision/flickr30k/flickr30k-images diff --git a/lavis/configs/datasets/snli_ve/defaults_instruct.yaml b/lavis/configs/datasets/snli_ve/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32d30846c11f85c042416d34b08024325cf443b9 --- /dev/null +++ b/lavis/configs/datasets/snli_ve/defaults_instruct.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + snli_ve_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_train.json + # - /export/share/dongxuli/data/lavis/snli/ve_train.json + storage: + - snli/annotations/ve_train.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_dev.json + # - /export/share/dongxuli/data/lavis/snli/ve_dev.json + storage: + - snli/annotations/ve_dev.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_test.json + # - /export/share/dongxuli/data/lavis/snli/ve_test.json + storage: + - snli/annotations/ve_test.json + images: + # storage: flickr30k/images/flickr30k-images + storage: /export/share/datasets/vision/flickr30k/flickr30k-images diff --git a/lavis/configs/datasets/textcaps/defaults.yaml b/lavis/configs/datasets/textcaps/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36b30915de6dfe86195b45dd8246e2d5affcb1d7 --- /dev/null +++ b/lavis/configs/datasets/textcaps/defaults.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + textcaps_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json + storage: + - TextCaps/TextCaps_0.1_train.json + val: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json + storage: + - TextCaps/TextCaps_0.1_val.json + test: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json + storage: + - TextCaps/TextCaps_0.1_test.json + images: + # storage: nocaps/images + storage: /export/share/datasets/vision_language/TextCaps/images diff --git a/lavis/configs/datasets/textcaps/defaults_instruct.yaml b/lavis/configs/datasets/textcaps/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..109da7c897df72f05125fe657127c7784b78785a --- /dev/null +++ b/lavis/configs/datasets/textcaps/defaults_instruct.yaml @@ -0,0 +1,47 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + textcaps_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json + storage: + - TextCaps/TextCaps_0.1_train.json + val: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json + storage: + - TextCaps/TextCaps_0.1_val.json + test: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json + storage: + - TextCaps/TextCaps_0.1_test.json + images: + # storage: nocaps/images + storage: /export/share/datasets/vision_language/TextCaps/images diff --git a/lavis/configs/datasets/valor/defaults_mm_cap.yaml b/lavis/configs/datasets/valor/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7451e6e0aa7004e79d204bc6f5e4f8cdd4bcd4de --- /dev/null +++ b/lavis/configs/datasets/valor/defaults_mm_cap.yaml @@ -0,0 +1,68 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + valor_mm_caption: # name of the dataset builder + data_type: [video, audio] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + storage: + - valor/annotations/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + storage: + - valor/annotations/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/VALOR/videos + + video: + storage: /export/video-language-dataset/data/VALOR/videos + diff --git a/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a47aa8570a655536681f4c1f427a2c90844ca2d --- /dev/null +++ b/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml @@ -0,0 +1,70 @@ + # Copyright (c) 
2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + valor_mm_caption_instruct: # name of the dataset builder + data_type: [video, audio] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + storage: + - valor/annotations/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + storage: + - valor/annotations/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/VALOR/videos + + video: + storage: /export/video-language-dataset/data/VALOR/videos + diff --git a/lavis/configs/datasets/vatex/defaults_cap.yaml b/lavis/configs/datasets/vatex/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5dc1c7ff6d30e8235582c44b9c284a90ab5b3b3 --- /dev/null +++ 
b/lavis/configs/datasets/vatex/defaults_cap.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + msvd_cap: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json + storage: vatex/annotations/cap_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json + storage: vatex/annotations/cap_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json + storage: vatex/annotations/cap_test.json + videos: + storage: /export/share/dongxuli/data/vatex diff --git a/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml b/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fd5f0038c693e671afb397735e1529265d40f7d --- /dev/null +++ b/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml @@ -0,0 +1,62 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vatex_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + data_type: [video, audio] + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json + storage: + - vatex/annotations/cap_train.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json + storage: + - vatex/annotations/cap_val.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json + storage: + - vatex/annotations/cap_test.json + + video: + storage: /export/video-language-dataset/data/vatex/ + + audio: + storage: /export/video-language-dataset/data/vatex/ diff --git a/lavis/configs/datasets/vg/defaults_caption.yaml b/lavis/configs/datasets/vg/defaults_caption.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed303b58d8976ab5a4b1da7c234405a14d559fff --- /dev/null +++ b/lavis/configs/datasets/vg/defaults_caption.yaml @@ -0,0 +1,18 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_caption: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json + storage: vg/annotations/vg_caption.json + images: + storage: vg/images/ diff --git a/lavis/configs/datasets/vg/defaults_caption_instruct.yaml b/lavis/configs/datasets/vg/defaults_caption_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8015e94ae6ee97bc821ab152b3897944aec2aaf4 --- /dev/null +++ b/lavis/configs/datasets/vg/defaults_caption_instruct.yaml @@ -0,0 +1,34 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_caption_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json + storage: vg/annotations/vg_caption.json + images: + storage: /export/share/datasets/vision/visual-genome/ #vg/images/ diff --git a/lavis/configs/datasets/vg/defaults_vqa.yaml b/lavis/configs/datasets/vg/defaults_vqa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..e12e5c860a0db616a80967f7515b47abedba519e --- /dev/null +++ b/lavis/configs/datasets/vg/defaults_vqa.yaml @@ -0,0 +1,18 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json + storage: vg/annotations/vg_qa.json + images: + storage: vg/images/ diff --git a/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml b/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..787c9529c21cf37fbdfcfd7f1c06593fec76163d --- /dev/null +++ b/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml @@ -0,0 +1,34 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_vqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: qa + modality: image + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json + storage: vg/annotations/vg_qa.json + images: + storage: /export/share/datasets/vision/visual-genome/ #vg/images/ diff --git a/lavis/configs/datasets/violin/defaults_cap.yaml b/lavis/configs/datasets/violin/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc90d482333cf2aaf6770a1ef83faef737101d1b --- /dev/null +++ b/lavis/configs/datasets/violin/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/lavis/configs/datasets/violin/defaults_cap_instruct.yaml b/lavis/configs/datasets/violin/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d43317f497b2aa2cf4db0322a3003ddb13ca76d --- /dev/null +++ b/lavis/configs/datasets/violin/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/lavis/configs/datasets/violin/defaults_entail.yaml b/lavis/configs/datasets/violin/defaults_entail.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82c33bd7ec84bafa4628eec6e6ac6b350a85c7a0 --- /dev/null +++ b/lavis/configs/datasets/violin/defaults_entail.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_entailment: # 22452 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/lavis/configs/datasets/violin/defaults_entail_instruct.yaml b/lavis/configs/datasets/violin/defaults_entail_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8eda1a263b8a51c0a425bbf6e5bdabc61a3de717 --- /dev/null +++ b/lavis/configs/datasets/violin/defaults_entail_instruct.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_entailment_instruct: # 22452 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/lavis/configs/datasets/visdial/defaults_dial.yaml b/lavis/configs/datasets/visdial/defaults_dial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4aaf71584c32c1566658c1439180030fcf0f2e6 --- /dev/null +++ b/lavis/configs/datasets/visdial/defaults_dial.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + visdial: # name of the dataset builder + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + val: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + # test: + # url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + # storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + images: + storage: /export/share/datasets/vision_language/visdial/ diff --git a/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml b/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4849c914822acf73a45440c6b2c432e3423c3261 --- /dev/null +++ b/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + visdial_instruct: # name of the dataset builder + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + val: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + # test: + # url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + # storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + images: + storage: /export/share/datasets/vision_language/visdial/ diff --git a/lavis/configs/datasets/vizwiz/defaults.yaml b/lavis/configs/datasets/vizwiz/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e44fa7761305da83331bbcd06bbf43db794c03cf --- /dev/null +++ b/lavis/configs/datasets/vizwiz/defaults.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vizwiz_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_question + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/val.json + # - /export/share/datasets/vision/vizwiz/Annotations/val.json + storage: + - vizwiz/annotations/val.json + # - /export/share/datasets/vision/vizwiz/Annotations/val.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/test.json + # - /export/share/datasets/vision/vizwiz/Annotations/test.json + storage: + - vizwiz/annotations/test.json + # - /export/share/datasets/vision/vizwiz/Annotations/test.json + images: + storage: /export/share/datasets/vision/vizwiz/images diff --git a/lavis/configs/datasets/vlep/defaults_cap.yaml b/lavis/configs/datasets/vlep/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0987a49e21e7427046bc9b0aead656819ba1b533 --- /dev/null +++ b/lavis/configs/datasets/vlep/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vlep_caption: # 4900 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + storage: + - vlep/annotations/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + storage: + - vlep/annotations/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + videos: + storage: /export/video-language-dataset/data/vlep/videos diff --git a/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml b/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d5e703ca5daa65a58e369643599da13dc71edcf --- /dev/null +++ b/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vlep_caption_instruct: # 4900 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + storage: + - vlep/annotations/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + storage: + - vlep/annotations/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + videos: + storage: /export/video-language-dataset/data/vlep/videos diff --git a/lavis/configs/datasets/vsr/defaults.yaml b/lavis/configs/datasets/vsr/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9e29b847b82fa03fc1b68fd821fc80b845a19ae --- /dev/null +++ b/lavis/configs/datasets/vsr/defaults.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_classification_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git a/lavis/configs/datasets/vsr/defaults_classification.yaml b/lavis/configs/datasets/vsr/defaults_classification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11edcfd285a188c47a702f150d2673c7d96a21f8 --- /dev/null +++ b/lavis/configs/datasets/vsr/defaults_classification.yaml @@ -0,0 +1,49 @@ + # 
Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_classification: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git a/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml b/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b09c521ef4e3e003edd604e9d93125372f11660a --- /dev/null +++ 
b/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_caption_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git a/lavis/configs/datasets/vsr/defaults_instruct.yaml b/lavis/configs/datasets/vsr/defaults_instruct.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..e96c6e765bda4de503a7f05a3adf91f3db5a40c7 --- /dev/null +++ b/lavis/configs/datasets/vsr/defaults_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_caption_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git 
a/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml b/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..463a8a5d91890f41abfa6c6e6fa200931c7e639d --- /dev/null +++ b/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml @@ -0,0 +1,63 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + wavcaps_mm_caption: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + eval: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + data_type: [audio] + + build_info: + kwargs: + cached: False + cached_dir: /export/share/datasets/audio/WavCaps/beats_features/ + + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json + storage: + - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json + - wavcaps/json_files/FreeSound/fsd_final.json + - wavcaps/json_files/SoundBible/sb_final.json + - wavcaps/json_files/AudioSet_SL/as_final.json + - wavcaps/annotations/json_data.json + # train: + # url: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - 
/export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + # storage: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + + audio: + storage: /export/share/datasets/audio/WavCaps/ + \ No newline at end of file diff --git a/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml b/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..227a475701cd73e122fb4ed8a13461b18c114284 --- /dev/null +++ b/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml @@ -0,0 +1,63 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + wavcaps_mm_caption_instruct: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + eval: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + text_processor: + train: + name: "blip_instruction" + modality: audio + task: caption + eval: + name: "blip_caption" + + data_type: [audio] + + build_info: + kwargs: + cached: True + cached_dir: /export/share/datasets/audio/WavCaps/beats_features/ + + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + # url: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + # storage: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + url: + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json + - 
https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json + storage: + - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json + - wavcaps/json_files/FreeSound/fsd_final.json + - wavcaps/json_files/SoundBible/sb_final.json + - wavcaps/json_files/AudioSet_SL/as_final.json + - wavcaps/annotations/json_data.json + + audio: + storage: /export/share/datasets/audio/WavCaps/ + \ No newline at end of file diff --git a/lavis/configs/datasets/webvid/defaults_cap.yaml b/lavis/configs/datasets/webvid/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94203848e1baad44ea41ff22942e21762c61fe43 --- /dev/null +++ b/lavis/configs/datasets/webvid/defaults_cap.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + webvid2m_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + eval: + name: alpro_video_eval + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json + # - /export/home/LAVIS/webvid_annotation.json + storage: + - webvid2m/annotations/train.json + # - /export/home/LAVIS/webvid_annotation.json + images: + storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos diff --git a/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml b/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..a98325f148504423583265895fdb6013b783055d --- /dev/null +++ b/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + webvid2m_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + eval: + name: alpro_video_eval + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + text_processor: + train: + name: "blip_instruction" + modality: video + task: caption + eval: + name: "blip_caption" + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json + # - /export/home/LAVIS/webvid_annotation.json + storage: + - webvid2m/annotations/train.json + # - /export/home/LAVIS/webvid_annotation.json + images: + storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos diff --git a/lavis/configs/datasets/youcook/defaults_cap.yaml b/lavis/configs/datasets/youcook/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..035c897e898acb8e7d5ad6c1ba29d9d0149712a8 --- /dev/null +++ b/lavis/configs/datasets/youcook/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + youcook_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + storage: + - youcook/annotations/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + storage: + - youcook/annotations/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + videos: + storage: /export/video-language-dataset/data/youcook/raw_videos diff --git a/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml b/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45b371c72238eed9f996efb0b03a89cb80446632 --- /dev/null +++ b/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + youcook_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + storage: + - youcook/annotations/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + storage: + - youcook/annotations/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + videos: + storage: /export/video-language-dataset/data/youcook/raw_videos diff --git a/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml b/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e46f417073a37c1d8f11c480cee6191f0a0bc9da --- /dev/null +++ b/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml @@ -0,0 +1,62 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + yt8m_mm_dialogue: # name of the dataset builder + data_type: [video] #extracted features of videos (I3D, VGGish) # [images|videos|features] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + audio_processor: + train: + name: beats_audio + # sampling_rate: 16000 + eval: + name: beats_audio + # sampling_rate: 16000 + is_eval: True + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json + storage: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json + val: + url: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json + storage: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/yt-8m/audios + + video: + storage: /export/video-language-dataset/data/yt-8m/videos + diff --git a/lavis/configs/default.yaml b/lavis/configs/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f58d32e264250895ab02b3d2e78a2ba6dfd3c125 --- /dev/null +++ b/lavis/configs/default.yaml @@ -0,0 +1,10 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +env: + # For default users + # cache_root: "cache" + # For internal use with persistent storage + cache_root: "/export/home/.cache/lavis" diff --git a/lavis/configs/models/albef_classification_ve.yaml b/lavis/configs/models/albef_classification_ve.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a2accab99fad7e2a880944515baefab496b18a7 --- /dev/null +++ b/lavis/configs/models/albef_classification_ve.yaml @@ -0,0 +1,40 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_classification + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + + num_classes: 3 + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config_albef.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + eval: + name: "blip_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/albef_feature_extractor.yaml b/lavis/configs/models/albef_feature_extractor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7def58e04a7b567e0a836e54f3dffdc62e1748ee --- /dev/null +++ b/lavis/configs/models/albef_feature_extractor.yaml @@ -0,0 +1,30 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_pretrain + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + + # vit encoder + vit_type: "base" + image_size: 224 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/albef_nlvr.yaml b/lavis/configs/models/albef_nlvr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86f17224aa0dfaa4739725e7c0516df4c679aa2d --- /dev/null +++ b/lavis/configs/models/albef_nlvr.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_nlvr + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" + + num_classes: 2 + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config_albef.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/albef_pretrain_base.yaml 
b/lavis/configs/models/albef_pretrain_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26e00efa423345b4a78332635d1a7c2e368fb02e --- /dev/null +++ b/lavis/configs/models/albef_pretrain_base.yaml @@ -0,0 +1,38 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_pretrain + + load_pretrained: True + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + + # vit encoder + vit_type: "base" + image_size: 224 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + mlm_mask_prob: 0.15 + + embed_dim: 256 + momentum: 0.995 + alpha: 0.4 + temp: 0.07 + + max_txt_len: 30 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 256 + text_processor: + train: + name: "blip_caption" diff --git a/lavis/configs/models/albef_retrieval_coco.yaml b/lavis/configs/models/albef_retrieval_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9971e6ca5d9aa85790ee2aefd9b7251e8a8b200c --- /dev/null +++ b/lavis/configs/models/albef_retrieval_coco.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_retrieval + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" + + queue_size: 65536 + + # vit encoder + vit_type: "base" + image_size: 384 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + + embed_dim: 256 + momentum: 0.995 + alpha: 0.4 + temp: 0.07 + use_distill: True + + max_txt_len: 30 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/albef_retrieval_flickr.yaml b/lavis/configs/models/albef_retrieval_flickr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5f77f0f99912d0f2c501e567dd0360e5c2b9336 --- /dev/null +++ b/lavis/configs/models/albef_retrieval_flickr.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_retrieval + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt + + queue_size: 65536 + + # vit encoder + vit_type: "base" + image_size: 384 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + + embed_dim: 256 + momentum: 0.995 + alpha: 0.4 + temp: 0.07 + use_distill: True + + max_txt_len: 30 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/albef_vqav2.yaml b/lavis/configs/models/albef_vqav2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e35559f356bd77f9eedaa76b43d393a142f40239 --- /dev/null +++ b/lavis/configs/models/albef_vqav2.yaml @@ -0,0 +1,40 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_vqa + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config_albef.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/lavis/configs/models/alpro_qa_msrvtt.yaml b/lavis/configs/models/alpro_qa_msrvtt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3f58a1308c0d2a2075c037f6defcd4500e29b1b --- /dev/null +++ b/lavis/configs/models/alpro_qa_msrvtt.yaml @@ -0,0 +1,44 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_qa + num_classes: 1500 + + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 16 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. 
+ drop_path_rate: 0.1 + + use_grad_ckpt: True + ckpt_layer: 12 + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + train: + name: "alpro_video_train" + n_frms: 16 + image_size: 224 + eval: + name: "alpro_video_eval" + n_frms: 16 + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/alpro_qa_msvd.yaml b/lavis/configs/models/alpro_qa_msvd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17d606fcc0fd8fb8adedbb992db49f6e56e67c5f --- /dev/null +++ b/lavis/configs/models/alpro_qa_msvd.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_qa + num_classes: 2423 + + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 16 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. 
+ drop_path_rate: 0.1 + use_grad_ckpt: True + ckpt_layer: 12 + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + train: + name: "alpro_video_train" + n_frms: 16 + image_size: 224 + eval: + name: "alpro_video_eval" + n_frms: 16 + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/alpro_retrieval_didemo.yaml b/lavis/configs/models/alpro_retrieval_didemo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd021c5a5d2e93e53e74ef4cf2a94bb921a6cd83 --- /dev/null +++ b/lavis/configs/models/alpro_retrieval_didemo.yaml @@ -0,0 +1,35 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_retrieval + + load_finetuned: True + + finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 8 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. + drop_path_rate: 0.1 + use_grad_ckpt: False + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + eval: + name: "alpro_video_eval" + n_frms: 8 + image_size: 224 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/alpro_retrieval_msrvtt.yaml b/lavis/configs/models/alpro_retrieval_msrvtt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..431aa3ea65f83a6213c88ae07465e0c1ff7cb3ea --- /dev/null +++ b/lavis/configs/models/alpro_retrieval_msrvtt.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_retrieval + + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 8 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. + drop_path_rate: 0.1 + use_grad_ckpt: False + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + train: + name: "alpro_video_train" + n_frms: 8 + image_size: 224 + eval: + name: "alpro_video_eval" + n_frms: 8 + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/bert_config.json b/lavis/configs/models/bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..477a9f42513d0afb774735f07177161bdd1ae94b --- /dev/null +++ b/lavis/configs/models/bert_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": true +} \ No newline at end of file diff --git a/lavis/configs/models/bert_config_alpro.json b/lavis/configs/models/bert_config_alpro.json new file mode 100644 index 0000000000000000000000000000000000000000..a21b3a2c9344651c1d88797338de5830ca3fc043 --- /dev/null +++ b/lavis/configs/models/bert_config_alpro.json @@ -0,0 
+1,23 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": true, + "type_vocab_size": 2, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": false, + "fusion_layer": 6 +} \ No newline at end of file diff --git a/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml b/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45bd52ad02917fdb9cf67b209e2c1f3b65d4384a --- /dev/null +++ b/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml @@ -0,0 +1,25 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml b/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3a65ecc70041fee85c4d1f2db0c82c95f211355 --- /dev/null +++ b/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml @@ -0,0 +1,27 @@ +model: + vit_model: "clip_L" + + 
qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" + + controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml b/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29421a72565a63a9d60d5c9980a84219fce80155 --- /dev/null +++ b/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml @@ -0,0 +1,27 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" + + controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml b/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..275eba088a93654ef69304ff127879e50296a910 --- /dev/null +++ b/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml @@ -0,0 +1,27 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" + + controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml b/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6591e15c1a5c9c6052a95caba26c2b635a842785 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: caption_coco_flant5xl + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "a photo of" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml b/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5005fb72ada67d0e304483e5b98428f4be7c0236 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: caption_coco_opt2.7b + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-2.7b" + + # generation configs + prompt: "a photo of" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml b/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..464da1bb28668f6aa9106b3aac44cb500f85d727 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: caption_coco_opt6.7b + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-6.7b" + + # generation configs + prompt: "a photo of" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_coco.yaml b/lavis/configs/models/blip2/blip2_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03abc369b866db180c4e7bff8b00de637bc55cf0 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_coco.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: coco + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: True + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml b/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c2e3de96890d7a73aa75d4a35a4ff5928deb24d --- /dev/null +++ b/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: flant5xl + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml b/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c26cce2fc251d91400412ebbdbb66f00ddf77e54 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: flant5xxl + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xxl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml b/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10365394c0374595cf59d12ef25da3e64ea496f6 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: instruct_vicuna13b + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # path to Vicuna checkpoint + llm_model: "./llm/vicuna-13b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml b/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af67777d3940c7e6b75ea9ee7cac6a1f56b13b62 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: instruct_vicuna7b + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # path to Vicuna checkpoint + llm_model: "./llm/vicuna-7b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain.yaml b/lavis/configs/models/blip2/blip2_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..126025ebaeb20ec88ebc2af61d16acd37843125d --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml b/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf90da225618de43a3b5fa70954b363227fcd804 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_flant5xl + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml b/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fca3e9a0aa053245d08d376594f75336ba0150b7 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_flant5xl + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" + finetuned: "" + + # vit encoder + vit_model: "clip_L" + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml b/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8240904d01dde5b1dfd74baca6bb83421d92ac3e --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_flant5xxl + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xxl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml b/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4adfa5cd42752d01b4c8126d3b21ec85df000eee --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_llama + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # LLM + llm_model: "/export/home/project/stanford_alpaca/llama_7B" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml b/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6e0bccd3fa69814bbcc294bb0a28089f3a62e5a --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_opt2.7b + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-2.7b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml b/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89adbfe363272a90c5bc80fbdb8ca33f05e0033c --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_opt6.7b + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-6.7b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml b/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0a0fc6464abcfea3e08655e43e381c9456f62b5 --- /dev/null +++ b/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" + finetuned: "" + + # vit encoder + vit_model: "clip_L" + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip2/blip2_xinstruct_vicuna13b.yaml b/lavis/configs/models/blip2/blip2_xinstruct_vicuna13b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7502033dfadd81c710b5fdf92ef18ad36049a34a --- /dev/null +++ b/lavis/configs/models/blip2/blip2_xinstruct_vicuna13b.yaml @@ -0,0 +1,74 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_xinstruct + model_type: vicuna13b + load_pretrained: True + pretrained: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth + load_finetuned: False + finetuned: "" + stage1_url_or_filename: null + image_model: "eva_clip_g" + pc_model: "ulip2_pointbert" + video_model: "eva_clip_g" + audio_model: "beats" + pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/image_qformer.pth + pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth + pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/video_qformer.pth + pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth + load_attention_image_qformer: True + load_attention_pc_qformer: True + load_attention_video_qformer: True + load_attention_audio_qformer: True + load_ln_type_image: "image" + load_ln_type_video: "video" + load_ln_type_audio: "audio" + load_qformer_type_image: "image" + load_qformer_type_pc: "pc" + load_qformer_type_video: "video" + load_qformer_type_audio: "audio" + load_projection_image: True + load_projection_pc: True + load_projection_video: True + load_projection_audio: True + load_projection_type_image: "image" + load_projection_type_pc: "pc" + load_projection_type_video: "video" + load_projection_type_audio: "audio" + image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + pc_encoder_kwargs : {} + video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + 
audio_encoder_kwargs : {} + image_precision: "fp16" + pc_precision: "fp16" + video_precision: "fp16" + audio_precision: "fp16" + freeze_image: True + freeze_pc: True + freeze_video: True + freeze_audio: True + num_query_token: 32 + llm_model: "/path/to/vicuna-13b" + prompt: "" + max_txt_len: 128 + max_output_txt_len: 256 + apply_lemmatizer: False + num_few_shot_examples: 0 + few_shot_prob: 0 + qformer_text_input: True + llm_text_input: True + modalities : ["image", "video", "audio", "pc"] + use_cues: True + shared_qformer: False + pretrained_shared_qformer: Null + load_attention_shared_qformer: False + load_qformer_type_shared: "" + load_projection_shared: False + load_projection_type_shaped: "" + load_ln_type_shared: "" + shared_qformer_num_features: 512 + prefix: "" + postfix: "" diff --git a/lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml b/lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf736815c3914093e534906f871714efc2bf078a --- /dev/null +++ b/lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml @@ -0,0 +1,77 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_xinstruct + model_type: vicuna7b + load_pretrained: True + pretrained: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth + load_finetuned: False + finetuned: "" + stage1_url_or_filename: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth + image_model: "eva_clip_g" + pc_model: "ulip2_pointbert" + video_model: "eva_clip_g" + audio_model: "beats" + pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth + pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth + pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth + pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer_improved.pth + load_attention_image_qformer: True + load_attention_pc_qformer: True + load_attention_video_qformer: True + load_attention_audio_qformer: True + load_ln_type_image: "image" + load_ln_type_video: "video" + load_ln_type_pc: "pc" + load_ln_type_audio: "audio" + load_qformer_type_image: "image" + load_qformer_type_pc: "pc" + load_qformer_type_video: "video" + load_qformer_type_audio: "audio" + load_projection_image: True + load_projection_pc: True + load_projection_video: True + load_projection_audio: True + load_projection_type_image: "image" + load_projection_type_pc: "pc" + load_projection_type_video: "video" + load_projection_type_audio: "audio" + image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + 
pc_encoder_kwargs : {} + video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + audio_encoder_kwargs : {} + image_precision: "fp16" + pc_precision: "fp16" + video_precision: "fp16" + audio_precision: "fp16" + freeze_image: True + freeze_pc: True + freeze_video: True + freeze_audio: True + num_query_token: 32 + llm_model: "/path/to/vicuna-7b" + prompt: "" + max_txt_len: 128 + max_output_txt_len: 256 + apply_lemmatizer: False + num_few_shot_examples: 0 + few_shot_prob: 0 + qformer_text_input: True + llm_text_input: True + modalities : ["audio", "video", "pc", "image"] + use_cues: True + shared_qformer: False + pretrained_shared_qformer: Null + load_attention_shared_qformer: False + load_qformer_type_shared: "" + load_projection_shared: False + load_projection_type_shaped: "" + load_ln_type_shared: "" + shared_qformer_num_features: 512 + prefix: "USER: " + postfix: "\nASSISTANT:" + predict_with_gen: False + clean_tokenization: True \ No newline at end of file diff --git a/lavis/configs/models/blip_caption_base_coco.yaml b/lavis/configs/models/blip_caption_base_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ee481c234290fef7d74667c2ce3e8c66fc7a3ab --- /dev/null +++ b/lavis/configs/models/blip_caption_base_coco.yaml @@ -0,0 +1,38 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_caption + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + # generation configs + prompt: "a picture of " + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + eval: + name: "blip_image_eval" + text_processor: + train: + name: "blip_caption" + prompt: "a picture of " + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_caption_large_coco.yaml b/lavis/configs/models/blip_caption_large_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0e8ae93c3f5236aac93669c53db448d312aa5eb --- /dev/null +++ b/lavis/configs/models/blip_caption_large_coco.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_caption + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + eval: + name: "blip_image_eval" + text_processor: + train: + name: "blip_caption" + prompt: "a picture of " + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_classification_base.yaml b/lavis/configs/models/blip_classification_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bad38f200daeb3177dce269807ffada275e61ac3 --- /dev/null +++ b/lavis/configs/models/blip_classification_base.yaml @@ -0,0 +1,22 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_classification + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" diff --git a/lavis/configs/models/blip_feature_extractor_base.yaml b/lavis/configs/models/blip_feature_extractor_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eaee381415c9eb7e0bf787ad5cf9b61bf2690489 --- /dev/null +++ b/lavis/configs/models/blip_feature_extractor_base.yaml @@ -0,0 +1,29 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_pretrain + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 224 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_itm_base.yaml b/lavis/configs/models/blip_itm_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c79db89d3cb55575b5f4b8aa499859c5915b183 --- /dev/null +++ b/lavis/configs/models/blip_itm_base.yaml @@ -0,0 +1,31 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_image_text_matching + + load_finetuned: True + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_itm_large.yaml b/lavis/configs/models/blip_itm_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bcbf4850d2eb159c506e52a8fa88de59d3a87d7 --- /dev/null +++ b/lavis/configs/models/blip_itm_large.yaml @@ -0,0 +1,31 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_image_text_matching + + load_finetuned: True + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_nlvr.yaml b/lavis/configs/models/blip_nlvr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02ecb13f11bdd02b161633d0d8c3c74eab64ba21 --- /dev/null +++ b/lavis/configs/models/blip_nlvr.yaml @@ -0,0 +1,39 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_nlvr + model_type: nlvr + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + num_classes: 2 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_pretrain_base.yaml b/lavis/configs/models/blip_pretrain_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e265b832a618304d50e17a9dbf242bfe4df720db --- /dev/null +++ b/lavis/configs/models/blip_pretrain_base.yaml @@ -0,0 +1,35 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_pretrain + + load_pretrained: True + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 224 + alpha: 0.4 + + # bert config + med_config_path: "configs/models/bert_config.json" + + embed_dim: 256 + + # generation configs + prompt: "a picture of " + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" diff --git a/lavis/configs/models/blip_pretrain_large.yaml b/lavis/configs/models/blip_pretrain_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d01cbe3baf09dd118d3e127c1ce1d8e3ea2238a6 --- /dev/null +++ b/lavis/configs/models/blip_pretrain_large.yaml @@ -0,0 +1,22 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_pretrain + + # vit encoder + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 224 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + # generation configs + prompt: "a picture of " diff --git a/lavis/configs/models/blip_retrieval_coco.yaml b/lavis/configs/models/blip_retrieval_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30eb79028f12266224e5286e563381ba963bd756 --- /dev/null +++ b/lavis/configs/models/blip_retrieval_coco.yaml @@ -0,0 +1,39 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_retrieval + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + queue_size: 57600 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: True + vit_ckpt_layer: 4 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_retrieval_flickr.yaml b/lavis/configs/models/blip_retrieval_flickr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4bf1fbc2db796a3ce0f08dfa357fe982856d8a0 --- /dev/null +++ b/lavis/configs/models/blip_retrieval_flickr.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_retrieval + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + queue_size: 57600 + alpha: 0.4 + + negative_all_rank: False + + # vit encoder + vit_type: "base" + vit_grad_ckpt: True + vit_ckpt_layer: 4 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/lavis/configs/models/blip_vqa_aokvqa.yaml b/lavis/configs/models/blip_vqa_aokvqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3afe3e7a2e3a55c569a8c7fce3d83d1ef3ddabe --- /dev/null +++ b/lavis/configs/models/blip_vqa_aokvqa.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_vqa + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_drop_path_rate: 0.1 + + image_size: 480 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 480 + eval: + name: "blip_image_eval" + image_size: 480 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/lavis/configs/models/blip_vqa_okvqa.yaml b/lavis/configs/models/blip_vqa_okvqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb66ccbbf1f2faed4dfe916b042263861798d951 --- /dev/null +++ b/lavis/configs/models/blip_vqa_okvqa.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_vqa + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_drop_path_rate: 0.1 + + image_size: 480 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 480 + eval: + name: "blip_image_eval" + image_size: 480 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/lavis/configs/models/blip_vqav2.yaml b/lavis/configs/models/blip_vqav2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f0ce8daac2d23d47d342f17630ca86f7002cc50 --- /dev/null +++ b/lavis/configs/models/blip_vqav2.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_vqa + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_drop_path_rate: 0.1 + + image_size: 480 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 480 + eval: + name: "blip_image_eval" + image_size: 480 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/lavis/configs/models/clip/RN101-quickgelu.json b/lavis/configs/models/clip/RN101-quickgelu.json new file mode 100644 index 0000000000000000000000000000000000000000..1dbd19be9d289887b4e41bd50acdbdc78709efd3 --- /dev/null +++ b/lavis/configs/models/clip/RN101-quickgelu.json @@ -0,0 +1,22 @@ +{ + "embed_dim": 512, + "quick_gelu": true, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 23, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/RN101.json b/lavis/configs/models/clip/RN101.json new file mode 100644 index 0000000000000000000000000000000000000000..bf5babbc5a3ef48653083f10a549f42afe14727a --- /dev/null +++ b/lavis/configs/models/clip/RN101.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 23, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git 
a/lavis/configs/models/clip/RN50-quickgelu.json b/lavis/configs/models/clip/RN50-quickgelu.json new file mode 100644 index 0000000000000000000000000000000000000000..8c2f91260cdeb043434dc1e893cce81d4ce7f0d1 --- /dev/null +++ b/lavis/configs/models/clip/RN50-quickgelu.json @@ -0,0 +1,22 @@ +{ + "embed_dim": 1024, + "quick_gelu": true, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 6, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/RN50.json b/lavis/configs/models/clip/RN50.json new file mode 100644 index 0000000000000000000000000000000000000000..ad98b4b8822d72b5196ddafcb732329ecad2ce56 --- /dev/null +++ b/lavis/configs/models/clip/RN50.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 6, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/RN50x16.json b/lavis/configs/models/clip/RN50x16.json new file mode 100644 index 0000000000000000000000000000000000000000..66576383a0cbd2ffcdd7a050e5fcbab420c7fecb --- /dev/null +++ b/lavis/configs/models/clip/RN50x16.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 384, + "layers": [ + 6, + 8, + 18, + 8 + ], + "width": 96, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/RN50x4.json b/lavis/configs/models/clip/RN50x4.json new file mode 100644 index 0000000000000000000000000000000000000000..a41cb630517cc155c1ee6aa8660f6c7948f3ee4b --- /dev/null +++ b/lavis/configs/models/clip/RN50x4.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 288, + "layers": [ + 4, + 6, + 10, 
+ 6 + ], + "width": 80, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-B-16-plus-240.json b/lavis/configs/models/clip/ViT-B-16-plus-240.json new file mode 100644 index 0000000000000000000000000000000000000000..9347280c60a2a19233ac027d810ded21c26ea867 --- /dev/null +++ b/lavis/configs/models/clip/ViT-B-16-plus-240.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 240, + "layers": 12, + "width": 896, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-B-16-plus.json b/lavis/configs/models/clip/ViT-B-16-plus.json new file mode 100644 index 0000000000000000000000000000000000000000..f9cc3e3b0084590581d1ec3e81b930a9a190e036 --- /dev/null +++ b/lavis/configs/models/clip/ViT-B-16-plus.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 896, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-B-16.json b/lavis/configs/models/clip/ViT-B-16.json new file mode 100644 index 0000000000000000000000000000000000000000..9afeef0fbc807f130f2b2bc65c1dd85abc9eba72 --- /dev/null +++ b/lavis/configs/models/clip/ViT-B-16.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 768, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-B-32-plus-256.json b/lavis/configs/models/clip/ViT-B-32-plus-256.json new file mode 100644 index 0000000000000000000000000000000000000000..27ae13857a0bdf0c7825ba7768de0071bda3e82e --- /dev/null +++ 
b/lavis/configs/models/clip/ViT-B-32-plus-256.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 256, + "layers": 12, + "width": 896, + "patch_size": 32 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-B-32-quickgelu.json b/lavis/configs/models/clip/ViT-B-32-quickgelu.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a063adbf96df9e169706286643ab9a261b251c --- /dev/null +++ b/lavis/configs/models/clip/ViT-B-32-quickgelu.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "quick_gelu": true, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 768, + "patch_size": 32 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-B-32.json b/lavis/configs/models/clip/ViT-B-32.json new file mode 100644 index 0000000000000000000000000000000000000000..abd1f7973dc856ba56004ad0538f4f74f5e08a6d --- /dev/null +++ b/lavis/configs/models/clip/ViT-B-32.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 768, + "patch_size": 32 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-H-14.json b/lavis/configs/models/clip/ViT-H-14.json new file mode 100644 index 0000000000000000000000000000000000000000..d2c01733dcab1293858bf8aa200f05cdb0b6f56c --- /dev/null +++ b/lavis/configs/models/clip/ViT-H-14.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 32, + "width": 1280, + "head_width": 80, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 1024, + "heads": 16, + "layers": 24 + } +} diff --git a/lavis/configs/models/clip/ViT-H-16.json 
b/lavis/configs/models/clip/ViT-H-16.json new file mode 100644 index 0000000000000000000000000000000000000000..942ed56bf6e24a0c19a41fad87db304444402b4f --- /dev/null +++ b/lavis/configs/models/clip/ViT-H-16.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 32, + "width": 1280, + "head_width": 80, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 1024, + "heads": 16, + "layers": 24 + } +} diff --git a/lavis/configs/models/clip/ViT-L-14-280.json b/lavis/configs/models/clip/ViT-L-14-280.json new file mode 100644 index 0000000000000000000000000000000000000000..c8e5fbac8a14c4c66c57df166ffe5dceb188e436 --- /dev/null +++ b/lavis/configs/models/clip/ViT-L-14-280.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 280, + "layers": 24, + "width": 1024, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-L-14-336.json b/lavis/configs/models/clip/ViT-L-14-336.json new file mode 100644 index 0000000000000000000000000000000000000000..4db3a1e77c891cda4d32ea3b9da9bef2c2aade0c --- /dev/null +++ b/lavis/configs/models/clip/ViT-L-14-336.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 336, + "layers": 24, + "width": 1024, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-L-14.json b/lavis/configs/models/clip/ViT-L-14.json new file mode 100644 index 0000000000000000000000000000000000000000..98951b0cbff3776e90b0c2685ce4d04f1f874343 --- /dev/null +++ b/lavis/configs/models/clip/ViT-L-14.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 224, + "layers": 24, + "width": 1024, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + 
"vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-L-16-320.json b/lavis/configs/models/clip/ViT-L-16-320.json new file mode 100644 index 0000000000000000000000000000000000000000..cc09c4877d27597fb0f50332e7cbcf8028586ce2 --- /dev/null +++ b/lavis/configs/models/clip/ViT-L-16-320.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 320, + "layers": 24, + "width": 1024, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-L-16.json b/lavis/configs/models/clip/ViT-L-16.json new file mode 100644 index 0000000000000000000000000000000000000000..78601e7a6822382e3466c1c00459392ee7768024 --- /dev/null +++ b/lavis/configs/models/clip/ViT-L-16.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 224, + "layers": 24, + "width": 1024, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/ViT-g-14.json b/lavis/configs/models/clip/ViT-g-14.json new file mode 100644 index 0000000000000000000000000000000000000000..b5c4231a67a82d1c30b675719f3004daed84299b --- /dev/null +++ b/lavis/configs/models/clip/ViT-g-14.json @@ -0,0 +1,18 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 40, + "width": 1408, + "head_width": 88, + "mlp_ratio": 4.3637, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 1024, + "heads": 16, + "layers": 24 + } +} diff --git a/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json b/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4bfb1df0240d72552e7b09dd4d17ee48a1c0e6 --- /dev/null +++ b/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json @@ 
-0,0 +1,17 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "timm_model_name": "efficientnetv2_rw_s", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 288 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-resnet50d.json b/lavis/configs/models/clip/timm-resnet50d.json new file mode 100644 index 0000000000000000000000000000000000000000..7bb0957cd23e3dd0fb461764c959a75e04cae743 --- /dev/null +++ b/lavis/configs/models/clip/timm-resnet50d.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "timm_model_name": "resnet50d", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-resnetaa50d.json b/lavis/configs/models/clip/timm-resnetaa50d.json new file mode 100644 index 0000000000000000000000000000000000000000..c011e0c02b5d63b1ace51e4625d383adc6aedb50 --- /dev/null +++ b/lavis/configs/models/clip/timm-resnetaa50d.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "timm_model_name": "resnetaa50d", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-resnetblur50.json b/lavis/configs/models/clip/timm-resnetblur50.json new file mode 100644 index 0000000000000000000000000000000000000000..05d0b209ac44198bd0b45c6931dee71eac9b1eab --- /dev/null +++ b/lavis/configs/models/clip/timm-resnetblur50.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "timm_model_name": "resnetblur50", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + 
"image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json b/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json new file mode 100644 index 0000000000000000000000000000000000000000..bc08f2b78543857445d22eec7d288c5fe86391a9 --- /dev/null +++ b/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "swin_base_patch4_window7_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-vit_base_patch16_224.json b/lavis/configs/models/clip/timm-vit_base_patch16_224.json new file mode 100644 index 0000000000000000000000000000000000000000..133b88f2f919de44c19df8318c7297824accbdce --- /dev/null +++ b/lavis/configs/models/clip/timm-vit_base_patch16_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "vit_base_patch16_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-vit_base_patch32_224.json b/lavis/configs/models/clip/timm-vit_base_patch32_224.json new file mode 100644 index 0000000000000000000000000000000000000000..9dcc6ffbfda4fb9d206bb693f6c3d53f2757aff8 --- /dev/null +++ b/lavis/configs/models/clip/timm-vit_base_patch32_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "vit_base_patch32_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + 
"context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip/timm-vit_small_patch16_224.json b/lavis/configs/models/clip/timm-vit_small_patch16_224.json new file mode 100644 index 0000000000000000000000000000000000000000..8c3ae01ab318ce07c19b7b6326c07aaec1f321a4 --- /dev/null +++ b/lavis/configs/models/clip/timm-vit_small_patch16_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "vit_small_patch16_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/lavis/configs/models/clip_resnet50.yaml b/lavis/configs/models/clip_resnet50.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce3a2d429646b4b58706715d07da0ecb6c0d767b --- /dev/null +++ b/lavis/configs/models/clip_resnet50.yaml @@ -0,0 +1,11 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: RN50 + + pretrained: openai diff --git a/lavis/configs/models/clip_vit_base16.yaml b/lavis/configs/models/clip_vit_base16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a06fa180993c42e63cecee38ec01134c18de7c8 --- /dev/null +++ b/lavis/configs/models/clip_vit_base16.yaml @@ -0,0 +1,17 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-B-16 + + pretrained: openai + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 224 diff --git a/lavis/configs/models/clip_vit_base32.yaml b/lavis/configs/models/clip_vit_base32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..056e3d967853f5c01426514a9f98622bc92241b8 --- /dev/null +++ b/lavis/configs/models/clip_vit_base32.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-B-32 +# ['RN50', +# 'RN50-quickgelu', +# 'RN50x4', +# 'RN50x16', +# 'RN101', +# 'RN101-quickgelu', +# 'timm-efficientnetv2_rw_s', +# 'timm-resnet50d', +# 'timm-resnetaa50d', +# 'timm-resnetblur50', +# 'timm-swin_base_patch4_window7_224', +# 'timm-vit_base_patch16_224', +# 'timm-vit_base_patch32_224', +# 'timm-vit_small_patch16_224', +# 'ViT-B-16', +# 'ViT-B-16-plus', +# 'ViT-B-16-plus-240', +# 'ViT-B-32', +# 'ViT-B-32-plus-256', +# 'ViT-B-32-quickgelu', +# 'ViT-g-14', +# 'ViT-H-14', +# 'ViT-H-16', +# 'ViT-L-14', +# 'ViT-L-14-280', +# 'ViT-L-14-336', +# 'ViT-L-16', +# 'ViT-L-16-320'] + + pretrained: openai + # "openai" + # following not available for all models + # "yfcc15m" + # "cc12m" + # "laion400m_e31" + # "laion400m_e32" + # "laion400m_avg" + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 224 diff --git a/lavis/configs/models/clip_vit_large14.yaml b/lavis/configs/models/clip_vit_large14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ab9f2610f1ae9e0164f39565a8302ab33123548 --- /dev/null +++ b/lavis/configs/models/clip_vit_large14.yaml @@ -0,0 +1,52 @@ + # 
Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-L-14 +# ['RN50', +# 'RN50-quickgelu', +# 'RN50x4', +# 'RN50x16', +# 'RN101', +# 'RN101-quickgelu', +# 'timm-efficientnetv2_rw_s', +# 'timm-resnet50d', +# 'timm-resnetaa50d', +# 'timm-resnetblur50', +# 'timm-swin_base_patch4_window7_224', +# 'timm-vit_base_patch16_224', +# 'timm-vit_base_patch32_224', +# 'timm-vit_small_patch16_224', +# 'ViT-B-16', +# 'ViT-B-16-plus', +# 'ViT-B-16-plus-240', +# 'ViT-B-32', +# 'ViT-B-32-plus-256', +# 'ViT-B-32-quickgelu', +# 'ViT-g-14', +# 'ViT-H-14', +# 'ViT-H-16', +# 'ViT-L-14', +# 'ViT-L-14-280', +# 'ViT-L-14-336', +# 'ViT-L-16', +# 'ViT-L-16-320'] + + pretrained: openai + # "openai" + # following not available for all models + # "yfcc15m" + # "cc12m" + # "laion400m_e31" + # "laion400m_e32" + # "laion400m_avg" + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 224 diff --git a/lavis/configs/models/clip_vit_large14_336.yaml b/lavis/configs/models/clip_vit_large14_336.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6510d73763fd4f0e5c6512c10c5c0ad8242499b --- /dev/null +++ b/lavis/configs/models/clip_vit_large14_336.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-L-14-336 +# ['RN50', +# 'RN50-quickgelu', +# 'RN50x4', +# 'RN50x16', +# 'RN101', +# 'RN101-quickgelu', +# 'timm-efficientnetv2_rw_s', +# 'timm-resnet50d', +# 'timm-resnetaa50d', +# 'timm-resnetblur50', +# 'timm-swin_base_patch4_window7_224', +# 'timm-vit_base_patch16_224', +# 'timm-vit_base_patch32_224', +# 'timm-vit_small_patch16_224', +# 'ViT-B-16', +# 'ViT-B-16-plus', +# 'ViT-B-16-plus-240', +# 'ViT-B-32', +# 'ViT-B-32-plus-256', +# 'ViT-B-32-quickgelu', +# 'ViT-g-14', +# 'ViT-H-14', +# 'ViT-H-16', +# 'ViT-L-14', +# 'ViT-L-14-280', +# 'ViT-L-14-336', +# 'ViT-L-16', +# 'ViT-L-16-320'] + + pretrained: openai + # "openai" + # following not available for all models + # "yfcc15m" + # "cc12m" + # "laion400m_e31" + # "laion400m_e32" + # "laion400m_avg" + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 336 diff --git a/lavis/configs/models/gpt_dialogue_base.yaml b/lavis/configs/models/gpt_dialogue_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bbdae83fbe10b7e7d9001292eb88ba3da4e2e04 --- /dev/null +++ b/lavis/configs/models/gpt_dialogue_base.yaml @@ -0,0 +1,25 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: gpt_dialogue + # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" + # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens + + len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 + +preprocess: + vis_processor: + train: + name: "gpt_video_ft" + eval: + name: "gpt_video_ft" + text_processor: + train: + name: "gpt_dialogue" + eval: + name: "gpt_dialogue" \ No newline at end of file diff --git a/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml b/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fac355c4312bf54d3d87057d9bc7d665f1f03a06 --- /dev/null +++ b/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml @@ -0,0 +1,58 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: img2prompt_vqa + model_type: base + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + question_generation_moodel: + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" + + + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/med_config.json b/lavis/configs/models/med_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a566c17bbc185f5bf8b83c7ed7dcb02e1a0ba1f9 --- /dev/null +++ b/lavis/configs/models/med_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": 
false, + "vocab_size": 30524, + "encoder_width": 768, + "add_cross_attention": true +} \ No newline at end of file diff --git a/lavis/configs/models/med_config_albef.json b/lavis/configs/models/med_config_albef.json new file mode 100644 index 0000000000000000000000000000000000000000..529636d733bf35cdb82ec4c7950ede79a5ce80fc --- /dev/null +++ b/lavis/configs/models/med_config_albef.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": true, + "fusion_layer": 6 +} \ No newline at end of file diff --git a/lavis/configs/models/med_large_config.json b/lavis/configs/models/med_large_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d5090b06f13c6c1e42d91e30d2cd76c2b6264d3a --- /dev/null +++ b/lavis/configs/models/med_large_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30524, + "encoder_width": 1024, + "add_cross_attention": true +} \ No newline at end of file diff --git a/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml b/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31f43778865db534e0070249db1512f50d937238 --- /dev/null +++ 
b/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml @@ -0,0 +1,60 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pnp_vqa + model_type: 3b + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + question_answering_model: + arch: pnp_unifiedqav2_fid + + pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" + + t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml b/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5630578bbe24f4788396fbe40ae365580911d1aa --- /dev/null +++ b/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml @@ -0,0 +1,59 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pnp_vqa + model_type: base + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + question_answering_model: + arch: pnp_unifiedqav2_fid + + pretrained: "allenai/unifiedqa-v2-t5-base-1363200" + + t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json" + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml b/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bea044c9079c33a7f7ec3a31c13f2da311d042e0 --- /dev/null +++ b/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml @@ -0,0 +1,60 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pnp_vqa + model_type: large + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + question_answering_model: + arch: pnp_unifiedqav2_fid + + pretrained: "allenai/unifiedqa-v2-t5-large-1363200" + + t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json" + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json b/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e5220dc592c03afd94f1a9d2077a2a87a3320856 --- /dev/null +++ b/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 16384, + "d_kv": 128, + "d_model": 1024, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "gradient_checkpointing": false, + "initializer_factor": 1.0, + 
"is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 24, + "num_heads": 32, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": true, + "vocab_size": 32128 +} \ No newline at end of file diff --git a/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json b/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json new file mode 100644 index 0000000000000000000000000000000000000000..24ffa8d18a0f317f3c18e5c67bf97ede953d6436 --- /dev/null +++ b/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 3072, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "gradient_checkpointing": false, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + 
"relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "transformers_version": "4.21.3", + "use_cache": true, + "vocab_size": 32128 +} \ No newline at end of file diff --git a/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json b/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4f87ec69734d35cdc0d76b1b3f11f9e80df3cdc1 --- /dev/null +++ b/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 4096, + "d_kv": 64, + "d_model": 1024, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "gradient_checkpointing": false, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 24, + "num_heads": 16, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + 
"prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "transformers_version": "4.21.3", + "use_cache": true, + "vocab_size": 32128 +} \ No newline at end of file diff --git a/lavis/datasets/__pycache__/data_utils.cpython-310.pyc b/lavis/datasets/__pycache__/data_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e7bb8fb1f44894dd73ddbf55bfcf6d59096f469 Binary files /dev/null and b/lavis/datasets/__pycache__/data_utils.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__init__.py b/lavis/datasets/builders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8c77f4e402c9efa61f25a07418990962cd68bdb8 --- /dev/null +++ b/lavis/datasets/builders/__init__.py @@ -0,0 +1,279 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import load_dataset_config +from lavis.datasets.builders.caption_builder import ( + COCOCapBuilder, + MSRVTTCapBuilder, + MSVDCapBuilder, + VATEXCapBuilder, + MSRVTTCapInstructBuilder, + MSVDCapInstructBuilder, + VATEXCapInstructBuilder, + WebVid2MCapBuilder, + WebVid2MCapInstructBuilder, + VALORCaptionBuilder, + VALORCaptionInstructBuilder, + ViolinCapBuilder, + ViolinCapInstructBuilder, + VlepCaptionInstructBuilder, + VlepCaptionBuilder, + YouCookCaptionBuilder, + YouCookCaptionInstructBuilder, + COINCaptionBuilder, + COINCaptionInstructBuilder, + CharadeCaptionBuilder, + CharadeCaptionInstructBuilder, + TextCapsCapBuilder, + TextCapsCapInstructBuilder, + Flickr30kCapBuilder, + Flickr30kCapInstructBuilder + +) +from lavis.datasets.builders.image_text_pair_builder import ( + ConceptualCaption12MBuilder, + ConceptualCaption12MInstructBuilder, + ConceptualCaption3MBuilder, + ConceptualCaption3MInstructBuilder, + VGCaptionBuilder, + VGCaptionInstructBuilder, + SBUCaptionBuilder, + SBUCaptionInstructBuilder, + Laion400MBuilder, + Laion400MInstructBuilder +) +from lavis.datasets.builders.classification_builder import ( + NLVRBuilder, + SNLIVisualEntailmentBuilder, + SNLIVisualEntailmentInstructBuilder, + ViolinEntailmentInstructBuilder, + ViolinEntailmentBuilder, + ESC50ClassificationBuilder +) +from lavis.datasets.builders.imagefolder_builder import ImageNetBuilder +from lavis.datasets.builders.video_qa_builder import ( + MSRVTTQABuilder, + MSVDQABuilder, + MSRVTTQAInstructBuilder, + MSVDQAInstructBuilder, + MusicAVQABuilder, + MusicAVQAInstructBuilder +) + +from lavis.datasets.builders.vqa_builder import ( + COCOVQABuilder, + COCOVQAInstructBuilder, + OKVQABuilder, + OKVQAInstructBuilder, + AOKVQABuilder, + AOKVQAInstructBuilder, + VGVQABuilder, + 
VGVQAInstructBuilder, + GQABuilder, + GQAInstructBuilder, + IconQABuilder, + IconQAInstructBuilder, + ScienceQABuilder, + ScienceQAInstructBuilder, + OCRVQABuilder, + OCRVQAInstructBuilder, + VizWizVQABuilder +) +from lavis.datasets.builders.retrieval_builder import ( + MSRVTTRetrievalBuilder, + DiDeMoRetrievalBuilder, + COCORetrievalBuilder, + Flickr30kBuilder, +) + +from lavis.datasets.builders.audio_caption_builder import ( + AudioSetBuilder, + AudioCapsCapBuilder, + AudioSetInstructBuilder, + AudioCapsInstructCapBuilder, + WavCapsCapInstructBuilder, + WavCapsCapBuilder +) + +from lavis.datasets.builders.object3d_caption_builder import ( + ObjaverseCaptionInstructBuilder, + ShapenetCaptionInstructBuilder, + ObjaverseCaptionBuilder, + ShapenetCaptionBuilder +) +from lavis.datasets.builders.object3d_qa_builder import ObjaverseQABuilder +from lavis.datasets.builders.object3d_classification_builder import ModelNetClassificationBuilder + +from lavis.datasets.builders.audio_qa_builder import AudioCapsQABuilder, ClothoQABuilder + +from lavis.datasets.builders.dialogue_builder import ( + AVSDDialBuilder, + AVSDDialInstructBuilder, + YT8MDialBuilder, + LLaVA150kDialInstructBuilder, + VisDialBuilder, + VisDialInstructBuilder +) +from lavis.datasets.builders.text_to_image_generation_builder import BlipDiffusionFinetuneBuilder + +from lavis.datasets.builders.discrn_builders import DiscrnImagePcBuilder, DiscrnAudioVideoBuilder + +from lavis.common.registry import registry + +__all__ = [ + "BlipDiffusionFinetuneBuilder", + "COCOCapBuilder", + "COCORetrievalBuilder", + "COCOVQABuilder", + "ConceptualCaption12MBuilder", + "ConceptualCaption3MBuilder", + "DiDeMoRetrievalBuilder", + "Flickr30kBuilder", + "GQABuilder", + "ImageNetBuilder", + "MSRVTTCapBuilder", + "MSRVTTQABuilder", + "MSRVTTRetrievalBuilder", + "MSVDCapBuilder", + "MSVDQABuilder", + "NLVRBuilder", + "OKVQABuilder", + "AOKVQABuilder", + "SBUCaptionBuilder", + "SNLIVisualEntailmentBuilder", + "VATEXCapBuilder", + 
"VGCaptionBuilder", + "VGVQABuilder", + "AVSDDialBuilder", + "Laion400MBuilder", + + "ViolinCapBuilder", + "ViolinEntailmentBuilder", + "VlepCaptionBuilder", + "YouCookCaptionBuilder", + "COINCaptionBuilder", + "CharadeCaptionBuilder", + "YT8MDialBuilder", + "IconQABuilder", + "ScienceQABuilder", + "VisDialBuilder", + "OCRVQABuilder", + "VizWizVQABuilder", + "TextCapsCapBuilder", + "Flickr30kCapBuilder", + "AudioSetBuilder", + "AudioCapsCapBuilder", + "WavCapsCapBuilder", + "WebVid2MCapBuilder", + "VALORCaptionBuilder", + "ObjaverseCaptionBuilder", + "ShapenetCaptionBuilder", + "ObjaverseQABuilder", + "MusicAVQABuilder", + "ESC50ClassificationBuilder", + + ## Instruction Builders + "AOKVQAInstructBuilder", + "OKVQAInstructBuilder", + "AudioSetInstructBuilder", + "AudioCapsInstructCapBuilder", + "AudioCapsQABuilder", + "WavCapsCapInstructBuilder", + "ObjaverseCaptionInstructBuilder", + "ShapenetCaptionInstructBuilder", + "ModelNetClassificationBuilder", + "ObjaverseCaptionInstructBuilder", + "MSRVTTCapInstructBuilder", + "MSVDCapInstructBuilder", + "VATEXCapInstructBuilder", + "WebVid2MCapInstructBuilder", + "MSRVTTQAInstructBuilder", + "MSVDQAInstructBuilder", + "VALORCaptionInstructBuilder", + "AVSDDialInstructBuilder", + "VisDialInstructBuilder", + "MusicAVQAInstructBuilder", + "ViolinCapInstructBuilder", + "ViolinEntailmentInstructBuilder", + "VlepCaptionInstructBuilder", + "YouCookCaptionInstructBuilder", + "COINCaptionInstructBuilder", + "CharadeCaptionInstructBuilder", + "COCOVQAInstructBuilder", + "VGVQAInstructBuilder", + "GQAInstructBuilder", + "IconQAInstructBuilder", + "SNLIVisualEntailmentInstructBuilder", + "Laion400MInstructBuilder", + "LLaVA150kDialInstructBuilder", + "ScienceQAInstructBuilder", + "OCRVQAInstructBuilder", + "TextCapsCapInstructBuilder", + "Flickr30kCapInstructBuilder", + "ConceptualCaption12MInstructBuilder", + "ConceptualCaption3MInstructBuilder", + "VGCaptionInstructBuilder", + "SBUCaptionInstructBuilder", + "ClothoQABuilder", + + 
# DisCRN + "DiscrnImagePcBuilder", + "DiscrnAudioVideoBuilder" + +] + + +def load_dataset(name, cfg_path=None, vis_path=None, data_type=None): + """ + Example + + >>> dataset = load_dataset("coco_caption", cfg=None) + >>> splits = dataset.keys() + >>> print([len(dataset[split]) for split in splits]) + + """ + if cfg_path is None: + cfg = None + else: + cfg = load_dataset_config(cfg_path) + + try: + builder = registry.get_builder_class(name)(cfg) + except TypeError: + print( + f"Dataset {name} not found. Available datasets:\n" + + ", ".join([str(k) for k in dataset_zoo.get_names()]) + ) + exit(1) + + if vis_path is not None: + if data_type is None: + # use default data type in the config + data_type = builder.config.data_type + + assert ( + data_type in builder.config.build_info + ), f"Invalid data_type {data_type} for {name}." + + builder.config.build_info.get(data_type).storage = vis_path + + dataset = builder.build_datasets() + return dataset + + +class DatasetZoo: + def __init__(self) -> None: + self.dataset_zoo = { + k: list(v.DATASET_CONFIG_DICT.keys()) + for k, v in sorted(registry.mapping["builder_name_mapping"].items()) + } + + def get_names(self): + return list(self.dataset_zoo.keys()) + + +dataset_zoo = DatasetZoo() diff --git a/lavis/datasets/builders/__pycache__/__init__.cpython-310.pyc b/lavis/datasets/builders/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d163d9c6a6a0f233a3e943c03fe6724308f5e6b2 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/audio_caption_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/audio_caption_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2844ae92d0ecca5b78e0b8afd1401c6112a60db9 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/audio_caption_builder.cpython-310.pyc differ diff --git 
a/lavis/datasets/builders/__pycache__/audio_qa_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/audio_qa_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cf404ff4652ffa7ce6893843d1007c8212120ee Binary files /dev/null and b/lavis/datasets/builders/__pycache__/audio_qa_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/base_dataset_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/base_dataset_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66f1b8fa9c0ad4008043cfa819f552c5ee1845b9 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/base_dataset_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/caption_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/caption_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91b198eb8f2ec6660eb189696bafb70b5b1b9740 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/caption_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/classification_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/classification_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bf1ca23e7ae5c161d3a2eaac4e2ee83b636a9a4 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/classification_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/dialogue_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/dialogue_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..553a3406df8c8c8811df67df7d8c4e86e48fae4e Binary files /dev/null and b/lavis/datasets/builders/__pycache__/dialogue_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/discrn_builders.cpython-310.pyc 
b/lavis/datasets/builders/__pycache__/discrn_builders.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c553faf2c13610abb5cf69f87ec1140902209fc Binary files /dev/null and b/lavis/datasets/builders/__pycache__/discrn_builders.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/image_text_pair_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/image_text_pair_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ad0a6f54d31374986ca4a67290e729df89786df Binary files /dev/null and b/lavis/datasets/builders/__pycache__/image_text_pair_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/imagefolder_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/imagefolder_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79478feaa38d38aa9236656ce57d27d6ef013501 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/imagefolder_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/object3d_caption_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/object3d_caption_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..623dcfc730d88735c584c93dc9e1b0a6cc4c75e6 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/object3d_caption_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/object3d_classification_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/object3d_classification_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..066bdeeb10cdc243620b2894565d680335333595 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/object3d_classification_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/object3d_qa_builder.cpython-310.pyc 
b/lavis/datasets/builders/__pycache__/object3d_qa_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0dd1858ccfeef8937b7664f76a8ede962c3da04c Binary files /dev/null and b/lavis/datasets/builders/__pycache__/object3d_qa_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/retrieval_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/retrieval_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76bd55ccec2348cc1f9db4fa4c625dec11bbc9c2 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/retrieval_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/text_to_image_generation_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/text_to_image_generation_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0aac70decb7bbafd792b4f72ef9ce2a1d0a4e677 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/text_to_image_generation_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/video_qa_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/video_qa_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c70cbed037eea6a015cb68d7db5431855b27e71a Binary files /dev/null and b/lavis/datasets/builders/__pycache__/video_qa_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/__pycache__/vqa_builder.cpython-310.pyc b/lavis/datasets/builders/__pycache__/vqa_builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..496fde7e18d349f972ab4b896ad7112dc5e67b18 Binary files /dev/null and b/lavis/datasets/builders/__pycache__/vqa_builder.cpython-310.pyc differ diff --git a/lavis/datasets/builders/audio_caption_builder.py b/lavis/datasets/builders/audio_caption_builder.py new file mode 100644 index 
0000000000000000000000000000000000000000..b4e3828e6ea1226e49d3dd8a212d32e844765416 --- /dev/null +++ b/lavis/datasets/builders/audio_caption_builder.py @@ -0,0 +1,123 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder + +from lavis.datasets.datasets.audio_captioning_datasets import ( + AudioSetDataset, + AudioSetEvalDataset, + AudioSetInstructDataset, + AudioCapsDataset, + AudioCapsEvalDataset, + AudioCapsInstructDataset, + ClothoV2Dataset, + ClothoV2InstructDataset, + ClothoV2EvalDataset, + AudioLanguagePretrainDataset, + AudioLanguagePretrainEvalDataset, + AudioLanguagePretrainInstructDataset +) + + +class AudioCapBuilder(MultiModalDatasetBuilder): + train_dataset_cls = AudioSetDataset + eval_dataset_cls = AudioSetEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audioset/defaults_mm_cap.yaml", + } + + + def build(self): + datasets = super().build() + build_info = self.config.build_info + for split,ds in datasets.items(): + # TODO: add option to download templates + templates = build_info.get('templates') + if templates == None: + ds._build_templates(None) + else: + ds._build_templates(build_info.templates.storage) + return datasets + +@registry.register_builder("audioset_mm_caption") +class AudioSetBuilder(AudioCapBuilder): + train_dataset_cls = AudioSetDataset + eval_dataset_cls = AudioSetEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audioset/defaults_mm_cap.yaml", + } + +@registry.register_builder("audioset_mm_caption_instruct") +class AudioSetInstructBuilder(AudioCapBuilder): + train_dataset_cls = AudioSetInstructDataset + eval_dataset_cls = AudioSetEvalDataset + + DATASET_CONFIG_DICT = { + 
"default": "configs/datasets/audioset/defaults_mm_cap_instruct.yaml", + } + +@registry.register_builder("audiocaps_mm_caption") +class AudioCapsCapBuilder(AudioCapBuilder): + train_dataset_cls = AudioCapsDataset + eval_dataset_cls = AudioCapsEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audiocaps/defaults_mm_cap.yaml", + } + +@registry.register_builder("audiocaps_mm_caption_instruct") +class AudioCapsInstructCapBuilder(AudioCapBuilder): + train_dataset_cls = AudioCapsInstructDataset + eval_dataset_cls = AudioCapsEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audiocaps/defaults_mm_cap_instruct.yaml", + } + +@registry.register_builder("clothov2") +class ClothoCapInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = ClothoV2Dataset + eval_dataset_cls = ClothoV2EvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/clotho/defaults_mm_cap.yaml", + } + +@registry.register_builder("clothov2_instruct") +class ClothoCapInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = ClothoV2InstructDataset + eval_dataset_cls = ClothoV2EvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/clotho/defaults_mm_cap_instruct.yaml", + } + + +@registry.register_builder("wavcaps_mm_caption") +class WavCapsCapBuilder(AudioCapBuilder): + train_dataset_cls = AudioLanguagePretrainDataset + eval_dataset_cls = AudioLanguagePretrainEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/wavcaps/defaults_mm_cap.yaml", + } + + + +@registry.register_builder("wavcaps_mm_caption_instruct") +class WavCapsCapInstructBuilder(AudioCapBuilder): + train_dataset_cls = AudioLanguagePretrainInstructDataset + eval_dataset_cls = AudioLanguagePretrainEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml", + } + + diff --git a/lavis/datasets/builders/audio_qa_builder.py b/lavis/datasets/builders/audio_qa_builder.py new file mode 100644 index 
0000000000000000000000000000000000000000..8e9435a2da3eeadd74c92ab73944d46bb4b5ca19 --- /dev/null +++ b/lavis/datasets/builders/audio_qa_builder.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.audio_caption_builder import AudioCapBuilder +from lavis.datasets.datasets.audio_qa_datasets import AudioCapsQADataset, ClothoQADataset + +@registry.register_builder("audiocaps_mm_qa") +class AudioCapsQABuilder(AudioCapBuilder): + train_dataset_cls = AudioCapsQADataset + eval_dataset_cls = AudioCapsQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audiocaps/defaults_mm_qa.yaml", + } + +@registry.register_builder("clotho_qa") +class ClothoQABuilder(AudioCapBuilder): + train_dataset_cls = ClothoQADataset + eval_dataset_cls = ClothoQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/clotho/defaults_mm_qa.yaml", + } \ No newline at end of file diff --git a/lavis/datasets/builders/base_dataset_builder.py b/lavis/datasets/builders/base_dataset_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b0d549a39ba0616608c6f5fc45338e6571e2ce --- /dev/null +++ b/lavis/datasets/builders/base_dataset_builder.py @@ -0,0 +1,327 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import logging +import os +import shutil +import warnings + +import lavis.common.utils as utils +import torch.distributed as dist +from lavis.common.dist_utils import is_dist_avail_and_initialized, is_main_process +from lavis.common.registry import registry +from lavis.datasets.data_utils import extract_archive +from lavis.processors.base_processor import BaseProcessor +from omegaconf import OmegaConf +from torchvision.datasets.utils import download_url + + +class BaseDatasetBuilder: + train_dataset_cls, eval_dataset_cls = None, None + + def __init__(self, cfg=None): + super().__init__() + + if cfg is None: + # help to create datasets from default config. + self.config = load_dataset_config(self.default_config_path()) + elif isinstance(cfg, str): + self.config = load_dataset_config(cfg) + else: + # when called from task.build_dataset() + self.config = cfg + + self.data_type = self.config.data_type + + self.vis_processors = {"train": BaseProcessor(), "eval": BaseProcessor()} + self.text_processors = {"train": BaseProcessor(), "eval": BaseProcessor()} + + # additional processors, each specified by a name in string. + self.kw_processors = {} + + def build_datasets(self): + # download, split, etc... + # only called on 1 GPU/TPU in distributed + + if is_main_process(): + self._download_data() + + if is_dist_avail_and_initialized(): + dist.barrier() + + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. 
+ logging.info("Building datasets...") + datasets = self.build() # dataset['train'/'val'/'test'] + + return datasets + + def build_processors(self): + vis_proc_cfg = self.config.get("vis_processor") + txt_proc_cfg = self.config.get("text_processor") + + if vis_proc_cfg is not None: + vis_train_cfg = vis_proc_cfg.get("train") + vis_eval_cfg = vis_proc_cfg.get("eval") + + self.vis_processors["train"] = self._build_proc_from_cfg(vis_train_cfg) + self.vis_processors["eval"] = self._build_proc_from_cfg(vis_eval_cfg) + + if txt_proc_cfg is not None: + txt_train_cfg = txt_proc_cfg.get("train") + txt_eval_cfg = txt_proc_cfg.get("eval") + + self.text_processors["train"] = self._build_proc_from_cfg(txt_train_cfg) + self.text_processors["eval"] = self._build_proc_from_cfg(txt_eval_cfg) + + kw_proc_cfg = self.config.get("kw_processor") + if kw_proc_cfg is not None: + for name, cfg in kw_proc_cfg.items(): + self.kw_processors[name] = self._build_proc_from_cfg(cfg) + + @staticmethod + def _build_proc_from_cfg(cfg): + return ( + registry.get_processor_class(cfg.name).from_config(cfg) + if cfg is not None + else None + ) + + @classmethod + def default_config_path(cls, type="default"): + return utils.get_abs_path(cls.DATASET_CONFIG_DICT[type]) + + def _download_data(self): + self._download_ann() + self._download_vis() + + def _download_ann(self): + """ + Download annotation files if necessary. + All the vision-language datasets should have annotations of unified format. + + storage_path can be: + (1) relative/absolute: will be prefixed with env.cache_root to make full path if relative. + (2) basename/dirname: will be suffixed with base name of URL if dirname is provided. + + Local annotation paths should be relative. 
+ """ + anns = self.config.build_info.annotations + + splits = anns.keys() + + cache_root = registry.get_path("cache_root") + + for split in splits: + info = anns[split] + + urls, storage_paths = info.get("url", None), info.storage + + if isinstance(urls, str): + urls = [urls] + if isinstance(storage_paths, str): + storage_paths = [storage_paths] + + assert len(urls) == len(storage_paths) + + for url_or_filename, storage_path in zip(urls, storage_paths): + # if storage_path is relative, make it full by prefixing with cache_root. + if not os.path.isabs(storage_path): + storage_path = os.path.join(cache_root, storage_path) + + dirname = os.path.dirname(storage_path) + if not os.path.exists(dirname): + os.makedirs(dirname) + + if os.path.isfile(url_or_filename): + src, dst = url_or_filename, storage_path + if not os.path.exists(dst): + shutil.copyfile(src=src, dst=dst) + else: + logging.info("Using existing file {}.".format(dst)) + else: + if os.path.isdir(storage_path): + # if only dirname is provided, suffix with basename of URL. + raise ValueError( + "Expecting storage_path to be a file path, got directory {}".format( + storage_path + ) + ) + else: + filename = os.path.basename(storage_path) + + download_url(url=url_or_filename, root=dirname, filename=filename) + + def _download_vis(self): + + storage_path = self.config.build_info.get(self.data_type).storage + storage_path = utils.get_cache_path(storage_path) + + if not os.path.exists(storage_path): + warnings.warn( + f""" + The specified path {storage_path} for visual inputs does not exist. + Please provide a correct path to the visual inputs or + refer to datasets/download_scripts/README.md for downloading instructions. + """ + ) + + def build(self): + """ + Create by split datasets inheriting torch.utils.data.Datasets. + + # build() can be dataset-specific. Overwrite to customize. 
+ """ + self.build_processors() + + build_info = self.config.build_info + + ann_info = build_info.annotations + vis_info = build_info.get(self.data_type) + + datasets = dict() + for split in ann_info.keys(): + if split not in ["train", "val", "test"]: + continue + + is_train = split == "train" + + # processors + vis_processor = ( + self.vis_processors["train"] + if is_train + else self.vis_processors["eval"] + ) + text_processor = ( + self.text_processors["train"] + if is_train + else self.text_processors["eval"] + ) + + # annotation path + ann_paths = ann_info.get(split).storage + if isinstance(ann_paths, str): + ann_paths = [ann_paths] + + abs_ann_paths = [] + for ann_path in ann_paths: + if not os.path.isabs(ann_path): + ann_path = utils.get_cache_path(ann_path) + abs_ann_paths.append(ann_path) + ann_paths = abs_ann_paths + + # visual data storage path + vis_path = vis_info.storage + + if not os.path.isabs(vis_path): + # vis_path = os.path.join(utils.get_cache_path(), vis_path) + vis_path = utils.get_cache_path(vis_path) + + if not os.path.exists(vis_path): + warnings.warn("storage path {} does not exist.".format(vis_path)) + + # create datasets + dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls + datasets[split] = dataset_cls( + vis_processor=vis_processor, + text_processor=text_processor, + ann_paths=ann_paths, + vis_root=vis_path, + ) + + return datasets + + +class MultiModalDatasetBuilder(BaseDatasetBuilder): + """ + MultiModalDatasetBuilder is a utility class designed to construct datasets + suitable for multi-modal tasks. This class simplifies the creation of + datasets that incorporate data of multiple modalities, such as text, + images, video, or audio. 
+ """ + train_dataset_cls, eval_dataset_cls = None, None + + def __init__(self, cfg=None): + super().__init__(cfg) + if isinstance(self.data_type, str): + self.data_type = [self.data_type] + + def _build_processor(self, cfg_name): + cfg = self.config.get(cfg_name) + return { + split: self._build_proc_from_cfg(cfg.get(split)) + if cfg is not None + else None + for split in ['train', 'eval'] + } + + def build_processors(self): + self.text_processors = self._build_processor("text_processor") + + self.processors = { + split: { + modality: self._build_proc_from_cfg( + self.config.get(f"{'vis' if 'image' in modality else modality}_processor").get(split) + ) + for modality in self.data_type + } + for split in ['train', 'eval'] + } + + def _download_multimodal(self, modality): + storage_path = utils.get_cache_path(self.config.build_info.get(modality).storage) + if not os.path.exists(storage_path): + warnings.warn(f"The specified path {storage_path} for {modality} inputs does not exist.") + + def _download_data(self): + self._download_ann() + for modality in self.data_type: + self._download_multimodal(modality) + + def _get_absolute_path(self, path): + if not os.path.isabs(path): + return utils.get_cache_path(path) + return path + + def build(self): + self.build_processors() + build_info = self.config.build_info + datasets = {} + + for split, info in build_info.annotations.items(): + if split not in ["train", "val", "test"]: + continue + + is_train = split == "train" + dataset_args = self._get_dataset_args(info, is_train) + + dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls + datasets[split] = dataset_cls(**dataset_args) + + return datasets + + def _get_dataset_args(self, info, is_train): + dataset_args = dict(self.config.build_info.get('kwargs', {})) + + for modality in self.data_type: + proc_name = f"{'vis' if 'image' in modality else modality}_processor" + dataset_args[proc_name] = self.processors["train" if is_train else "eval"][modality] + 
mm_path = self._get_absolute_path(self.config.build_info.get(modality).storage) + dataset_args[f"{'vis' if 'image' in modality else modality}_root"] = mm_path + + dataset_args['text_processor'] = self.text_processors["train" if is_train else "eval"] + dataset_args["ann_paths"] = [self._get_absolute_path(path) for path in info.storage] + dataset_args['modalities'] = self.data_type + + # Conform to base + for key in ['vis_processor', 'vis_root', 'test_processor']: + dataset_args.setdefault(key, None) + + return dataset_args + +def load_dataset_config(cfg_path): + cfg = OmegaConf.load(cfg_path).datasets + return next(iter(cfg.values())) \ No newline at end of file diff --git a/lavis/datasets/builders/caption_builder.py b/lavis/datasets/builders/caption_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d9663858be9c34d894116422dff4e3dc19b967 --- /dev/null +++ b/lavis/datasets/builders/caption_builder.py @@ -0,0 +1,321 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.capfilt_dataset import CapFiltCaptionInstructDataset, CapFiltCaptionDataset +from lavis.datasets.datasets.coco_caption_datasets import ( + COCOCapDataset, + COCOCapInstructDataset, + COCOCapEvalDataset, + NoCapsEvalDataset, +) + +from lavis.common.registry import registry +from lavis.datasets.datasets.video_caption_datasets import ( + VideoCaptionDataset, + VideoCaptionEvalDataset, + ClipCaptionDataset, + ClipCaptionInstructDataset, + ClipCaptionEvalDataset, + VideoCaptionInstructDataset, + WebVideoCaptionDataset, + WebVideoCaptionInstructDataset, +) +from lavis.datasets.datasets.violin_dataset import ( + ViolinVideoCaptionDataset, + ViolinVideoCaptionInstructDataset, + ViolinVideoCaptionEvalDataset +) +from lavis.datasets.datasets.valor_caption import VALORCaptionInstuctDataset, VALORCaptionEvalDataset, VALORCaptionDataset +from lavis.datasets.datasets.vatex_captioning_datasets import VATEXCaptionInstuctDataset, VATEXCaptionEvalDataset, VATEXCaptionDataset +from lavis.datasets.datasets.vlep_dataset import VlepVideoDataset, VlepVideoInstructDataset, VlepVideoEvalDataset +from lavis.datasets.datasets.vsr_datasets import VSRCaptionDataset, VSRCaptionInstructDataset, VSRCaptionEvalDataset +from lavis.datasets.datasets.textcaps_datasets import TextCapsCapDataset, TextCapsCapInstructDataset, TextCapsCapEvalDataset + +@registry.register_builder("coco_caption") +class COCOCapBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOCapDataset + eval_dataset_cls = COCOCapEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_cap.yaml", + } + +@registry.register_builder("coco_caption_instruct") +class COCOCapInstructBuilder(BaseDatasetBuilder): + 
train_dataset_cls = COCOCapInstructDataset + eval_dataset_cls = COCOCapEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_cap_instruct.yaml", + } + + +@registry.register_builder("flickr30k_caption") +class Flickr30kCapBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOCapDataset + eval_dataset_cls = COCOCapEvalDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr30k/defaults_cap.yaml", + } + +@registry.register_builder("flickr30k_caption_instruct") +class Flickr30kCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOCapInstructDataset + eval_dataset_cls = COCOCapEvalDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr30k/defaults_cap_instuct.yaml", + } + +@registry.register_builder("nocaps") +class COCOCapBuilder(BaseDatasetBuilder): + eval_dataset_cls = NoCapsEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/nocaps/defaults.yaml", + } + +@registry.register_builder("vsr_caption") +class VSRCapBuilder(BaseDatasetBuilder): + train_dataset_cls = VSRCaptionDataset + eval_dataset_cls = VSRCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vsr/defaults.yaml", + } + +@registry.register_builder("vsr_caption_instruct") +class VSRCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VSRCaptionInstructDataset + eval_dataset_cls = VSRCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vsr/defaults.yaml", + } + +@registry.register_builder("textcaps_caption") +class TextCapsCapBuilder(BaseDatasetBuilder): + train_dataset_cls = TextCapsCapDataset + eval_dataset_cls = TextCapsCapEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/textcaps/defaults.yaml", + } + +@registry.register_builder("textcaps_caption_instruct") +class TextCapsCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = TextCapsCapInstructDataset + eval_dataset_cls = TextCapsCapEvalDataset + + DATASET_CONFIG_DICT = { + 
"default": "configs/datasets/textcaps/defaults_instruct.yaml", + } + + +@registry.register_builder("capfilt14m") +class CapFiltCapBuilder(BaseDatasetBuilder): + train_dataset_cls = CapFiltCaptionDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/capfilt14m/defaults_cap.yaml", + } + +@registry.register_builder("capfilt14m_instruct") +class CapFiltCapBuilder(BaseDatasetBuilder): + train_dataset_cls = CapFiltCaptionInstructDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/capfilt14m/defaults_cap_instruct.yaml", + } + + +@registry.register_builder("msrvtt_caption") +class MSRVTTCapBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionDataset + eval_dataset_cls = VideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_cap.yaml", + } + + +@registry.register_builder("msvd_caption") +class MSVDCapBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionDataset + eval_dataset_cls = VideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_cap.yaml", + } + + +@registry.register_builder("vatex_caption") +class VATEXCapBuilder(MultiModalDatasetBuilder): + train_dataset_cls = VATEXCaptionDataset + eval_dataset_cls = VATEXCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vatex/defaults_cap.yaml", + } + +@registry.register_builder("msrvtt_caption_instruct") +class MSRVTTCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionInstructDataset + eval_dataset_cls = VideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_cap_instruct.yaml", + } + +@registry.register_builder("msvd_caption_instruct") +class MSVDCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionInstructDataset + eval_dataset_cls = VideoCaptionEvalDataset + + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_cap_instruct.yaml", + } + + 
+@registry.register_builder("vatex_caption_instruct") +class VATEXCapInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = VATEXCaptionInstuctDataset + eval_dataset_cls = VATEXCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vatex/defaults_cap_instruct.yaml", + } + + +@registry.register_builder("webvid2m_caption") +class WebVid2MCapBuilder(BaseDatasetBuilder): + train_dataset_cls = WebVideoCaptionDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/webvid/defaults_cap.yaml", + } + +@registry.register_builder("webvid2m_caption_instruct") +class WebVid2MCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = WebVideoCaptionInstructDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/webvid/defaults_cap_instruct.yaml", + } + +@registry.register_builder("violin_caption") +class ViolinCapBuilder(BaseDatasetBuilder): + train_dataset_cls = ViolinVideoCaptionDataset + eval_dataset_cls = ViolinVideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/violin/defaults_cap.yaml", + } + + +@registry.register_builder("violin_caption_instruct") +class ViolinCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ViolinVideoCaptionInstructDataset + eval_dataset_cls = ViolinVideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/violin/defaults_cap_instruct.yaml", + } + +@registry.register_builder("valor_mm_caption") +class VALORCaptionBuilder(MultiModalDatasetBuilder): + train_dataset_cls = VALORCaptionDataset + eval_dataset_cls = VALORCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/valor/defaults_mm_cap.yaml" + } + +@registry.register_builder("valor_mm_caption_instruct") +class VALORCaptionInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = VALORCaptionInstuctDataset + eval_dataset_cls = VALORCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/valor/defaults_mm_cap_instruct.yaml" + 
} + +@registry.register_builder("vlep_caption") +class VlepCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = VlepVideoDataset + eval_dataset_cls = VlepVideoEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vlep/defaults_cap.yaml" + } + + +@registry.register_builder("vlep_caption_instruct") +class VlepCaptionInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VlepVideoInstructDataset + eval_dataset_cls = VlepVideoEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vlep/defaults_cap_instruct.yaml" + } + +@registry.register_builder("youcook_caption") +class YouCookCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = ClipCaptionDataset + eval_dataset_cls = ClipCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/youcook/defaults_cap.yaml", + } + +@registry.register_builder("youcook_caption_instruct") +class YouCookCaptionInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ClipCaptionInstructDataset + eval_dataset_cls = ClipCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/youcook/defaults_cap_instruct.yaml", + } + +@registry.register_builder("coin_caption") +class COINCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = ClipCaptionDataset + eval_dataset_cls = ClipCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coin/defaults_cap.yaml", + } + + +@registry.register_builder("coin_caption_instruct") +class COINCaptionInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ClipCaptionInstructDataset + eval_dataset_cls = ClipCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coin/defaults_cap_instruct.yaml", + } + + +@registry.register_builder("charade_caption") +class CharadeCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = ClipCaptionDataset + eval_dataset_cls = ClipCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/charade/defaults_cap.yaml", + } + 
+@registry.register_builder("charade_caption_instruct") +class CharadeCaptionInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ClipCaptionInstructDataset + eval_dataset_cls = ClipCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/charade/defaults_cap_instruct.yaml", + } diff --git a/lavis/datasets/builders/classification_builder.py b/lavis/datasets/builders/classification_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..eedd8ab3b88d1cc5726a5eca3afbd7d323f6b85e --- /dev/null +++ b/lavis/datasets/builders/classification_builder.py @@ -0,0 +1,75 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset +from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset, SNLIVisualEntialmentInstructDataset +from lavis.datasets.datasets.violin_dataset import ViolinVideoEntailmentDataset, ViolinVideoEntailmentInstructDataset +from lavis.datasets.datasets.vsr_datasets import VSRClassificationDataset, VSRClassificationInstructDataset +from lavis.datasets.datasets.audio_classification_datasets import ESC50 +@registry.register_builder("violin_entailment") +class ViolinEntailmentBuilder(BaseDatasetBuilder): + train_dataset_cls = ViolinVideoEntailmentDataset + eval_dataset_cls = ViolinVideoEntailmentDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/violin/defaults_entail.yaml", + } + + +@registry.register_builder("violin_entailment_instruct") +class ViolinEntailmentInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ViolinVideoEntailmentInstructDataset + eval_dataset_cls = 
ViolinVideoEntailmentInstructDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/violin/defaults_entail_instruct.yaml", + } + +@registry.register_builder("nlvr") +class NLVRBuilder(BaseDatasetBuilder): + train_dataset_cls = NLVRDataset + eval_dataset_cls = NLVREvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} + + +@registry.register_builder("snli_ve") +class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): + train_dataset_cls = SNLIVisualEntialmentDataset + eval_dataset_cls = SNLIVisualEntialmentDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} + +@registry.register_builder("snli_ve_instruct") +class SNLIVisualEntailmentInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = SNLIVisualEntialmentInstructDataset + eval_dataset_cls = SNLIVisualEntialmentInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults_instruct.yaml"} + + +@registry.register_builder("vsr_classification") +class VSRClassificationBuilder(BaseDatasetBuilder): + train_dataset_cls = VSRClassificationDataset + eval_dataset_cls = VSRClassificationDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/vsr/defaults_classification.yaml"} + +@registry.register_builder("vsr_classification_instruct") +class SNLIVisualEntailmentInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VSRClassificationInstructDataset + eval_dataset_cls = VSRClassificationInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/vsr/defaults_classification_instruct.yaml"} + +@registry.register_builder("esc50_cls") +class ESC50ClassificationBuilder(MultiModalDatasetBuilder): + eval_dataset_cls = ESC50 + + DATASET_CONFIG_DICT = {"default": "configs/datasets/esc50/defaults_mm_cls.yaml"} diff --git a/lavis/datasets/builders/dialogue_builder.py b/lavis/datasets/builders/dialogue_builder.py new file mode 100644 index 
0000000000000000000000000000000000000000..3add7ba6c9fd6c9e35dbac5a4441b9e3cb6e89af --- /dev/null +++ b/lavis/datasets/builders/dialogue_builder.py @@ -0,0 +1,66 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.avsd_dialogue_datasets import ( + AVSDDialDataset, + AVSDDialEvalDataset, + AVSDDialInstructEvalDataset +) +from lavis.datasets.datasets.visdial_dialogue_datasets import ( + VisDialDataset, + VisDialInstructDataset, + VisDialEvalDataset, +) + +from lavis.datasets.datasets.yt8m_video_dialogue_datasets import YT8MDialDataset +from lavis.datasets.datasets.llava150k_dataset import LLaVA150kInstructDataset + + +@registry.register_builder("avsd_dialogue") +class AVSDDialBuilder(BaseDatasetBuilder): + train_dataset_cls = AVSDDialDataset + eval_dataset_cls = AVSDDialEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} + +@registry.register_builder("visdial") +class VisDialBuilder(BaseDatasetBuilder): + train_dataset_cls = VisDialDataset + eval_dataset_cls = VisDialEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/visdial/defaults_dial.yaml"} + +@registry.register_builder("visdial_instruct") +class VisDialInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VisDialInstructDataset + eval_dataset_cls = VisDialEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/visdial/defaults_dial_instruct.yaml"} + +@registry.register_builder("avsd_mm_dialogue_instruct") +class AVSDDialInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = AVSDDialInstructEvalDataset + eval_dataset_cls = AVSDDialInstructEvalDataset + + DATASET_CONFIG_DICT = 
{"default": "configs/datasets/avsd/defaults_mm_dial_instruct.yaml"} + +@registry.register_builder("llava150k_dialogue_instruct") +class LLaVA150kDialInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = LLaVA150kInstructDataset + eval_dataset_cls = LLaVA150kInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/llava150k/defaults_dial.yaml"} + +@registry.register_builder("yt8m_mm_dialogue") +class YT8MDialBuilder(MultiModalDatasetBuilder): + train_dataset_cls = YT8MDialDataset + eval_dataset_cls = YT8MDialDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/yt8m/defaults_mm_dial.yaml"} + diff --git a/lavis/datasets/builders/discrn_builders.py b/lavis/datasets/builders/discrn_builders.py new file mode 100644 index 0000000000000000000000000000000000000000..0f7cc690b03c7550a61a1c096fb0d3019bc6ae77 --- /dev/null +++ b/lavis/datasets/builders/discrn_builders.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.discriminatory_reasoning_datasets import DisCRnDataset + + + +@registry.register_builder("image_pc_discrn") +class DiscrnImagePcBuilder(MultiModalDatasetBuilder): + eval_dataset_cls = DisCRnDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml", + } + +@registry.register_builder("audio_video_discrn") +class DiscrnAudioVideoBuilder(MultiModalDatasetBuilder): + eval_dataset_cls = DisCRnDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml", + } diff --git a/lavis/datasets/builders/image_text_pair_builder.py 
b/lavis/datasets/builders/image_text_pair_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..9cc0ad11e8f0943d1785cc4c531ca1c30df19c4e --- /dev/null +++ b/lavis/datasets/builders/image_text_pair_builder.py @@ -0,0 +1,120 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +from lavis.common.registry import registry + +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder +from lavis.datasets.datasets.image_text_pair_datasets import ImageTextPairDataset, ImageTextPairInstructDataset +from lavis.datasets.datasets.laion_dataset import LaionDataset, LaionInstructDataset + +@registry.register_builder("conceptual_caption_3m") +class ConceptualCaption3MBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/conceptual_caption/defaults_3m.yaml" + } + +@registry.register_builder("conceptual_caption_3m_instruct") +class ConceptualCaption3MInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairInstructDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/conceptual_caption/defaults_3m_instruct.yaml" + } + + +@registry.register_builder("conceptual_caption_12m") +class ConceptualCaption12MBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/conceptual_caption/defaults_12m.yaml" + } + +@registry.register_builder("conceptual_caption_12m_instruct") +class ConceptualCaption12MInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairInstructDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/conceptual_caption/defaults_12m_instruct.yaml" + } + +@registry.register_builder("sbu_caption") +class SBUCaptionBuilder(BaseDatasetBuilder): + 
train_dataset_cls = ImageTextPairDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/sbu_caption/defaults.yaml"} + + +@registry.register_builder("sbu_caption_instruct") +class SBUCaptionInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/sbu_caption/defaults_instruct.yaml"} + + +@registry.register_builder("vg_caption") +class VGCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_caption.yaml"} + + +@registry.register_builder("vg_caption_instruct") +class VGCaptionInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = ImageTextPairInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_caption_instruct.yaml"} + + + +@registry.register_builder("laion2B_multi") +class Laion2BMultiBuilder(BaseDatasetBuilder): + train_dataset_cls = LaionDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults_2B_multi.yaml"} + + def _download_ann(self): + pass + + def _download_vis(self): + pass + + def build(self): + self.build_processors() + + build_info = self.config.build_info + + datasets = dict() + split = "train" # laion dataset only has train split + + # create datasets + # [NOTE] return inner_datasets (wds.DataPipeline) + dataset_cls = self.train_dataset_cls + datasets[split] = dataset_cls( + vis_processor=self.vis_processors[split], + text_processor=self.text_processors[split], + location=build_info.storage, + ).inner_dataset + + return datasets + +@registry.register_builder("laion400M") +class Laion400MBuilder(Laion2BMultiBuilder): + train_dataset_cls = LaionDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults_400M.yaml"} + + +@registry.register_builder("laion400M_instruct") +class Laion400MInstructBuilder(Laion2BMultiBuilder): + train_dataset_cls = LaionInstructDataset + + DATASET_CONFIG_DICT = 
{"default": "configs/datasets/laion/defaults_400M_instruct.yaml"} + diff --git a/lavis/datasets/builders/imagefolder_builder.py b/lavis/datasets/builders/imagefolder_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..6c71fbe216156f7e18f3a0d49004d558508980e8 --- /dev/null +++ b/lavis/datasets/builders/imagefolder_builder.py @@ -0,0 +1,1061 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder +from lavis.datasets.datasets.imagefolder_dataset import ImageFolderDataset + + +@registry.register_builder("imagenet") +class ImageNetBuilder(BaseDatasetBuilder): # builds one ImageFolderDataset per configured split + train_dataset_cls = ImageFolderDataset + eval_dataset_cls = ImageFolderDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/imagenet/defaults.yaml"} + + def _download_ann(self): # intentionally does nothing + pass + + def build(self): # returns a dict mapping every split in build_info.splits to its dataset + self.build_processors() + + build_info = self.config.build_info + + vis_info = build_info.get(self.data_type) + + datasets = dict() + for split in build_info.splits: + assert split in [ + "train", + "val", + ], "Invalid split name {}, must be one of 'train' and 'val'.".format(split) # names the offending split and lists only the accepted ones
+ + is_train = split == "train" + + vis_processor = ( + self.vis_processors["train"] + if is_train + else self.vis_processors["eval"] + ) + + vis_path = os.path.join(vis_info.storage, split) # each split is read from its own sub-directory of the storage root + + # create datasets + dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls + datasets[split] = dataset_cls( + vis_processor=vis_processor, + vis_root=vis_path, + classnames=imagenet_classnames, + ) + + return datasets + + +imagenet_classnames = [ + "tench", + "goldfish", + "great white shark", + "tiger shark", + "hammerhead shark", + "electric ray", + "stingray", + "rooster", + "hen", + "ostrich", + "brambling", + "goldfinch", + "house finch", + "junco", + "indigo bunting", + "American robin", + "bulbul", + "jay", + "magpie", + "chickadee", + "American dipper", + "kite (bird of prey)", + "bald eagle", + "vulture", + "great grey owl", + "fire salamander", + "smooth newt", + "newt", + "spotted salamander", + "axolotl", + "American bullfrog", + "tree frog", + "tailed frog", + "loggerhead sea turtle", + "leatherback sea turtle", + "mud turtle", + "terrapin", + "box turtle", + "banded gecko", + "green iguana", + "Carolina anole", + "desert grassland whiptail lizard", + "agama", + "frilled-necked lizard", + "alligator lizard", + "Gila monster", + "European green lizard", + "chameleon", + "Komodo dragon", + "Nile crocodile", + "American alligator", + "triceratops", + "worm snake", + "ring-necked snake", + "eastern hog-nosed snake", + "smooth green snake", + "kingsnake", + "garter snake", + "water snake", + "vine snake", + "night snake", + "boa constrictor", + "African rock python", + "Indian cobra", + "green mamba", + "sea snake", + "Saharan horned viper", + "eastern diamondback rattlesnake", + "sidewinder rattlesnake", + "trilobite", + "harvestman", + "scorpion", + "yellow garden spider", + "barn spider", + "European garden spider", + "southern black widow", + "tarantula", + "wolf spider", + "tick", + "centipede", + "black grouse", + "ptarmigan", + "ruffed
grouse", + "prairie grouse", + "peafowl", + "quail", + "partridge", + "african grey parrot", + "macaw", + "sulphur-crested cockatoo", + "lorikeet", + "coucal", + "bee eater", + "hornbill", + "hummingbird", + "jacamar", + "toucan", + "duck", + "red-breasted merganser", + "goose", + "black swan", + "tusker", + "echidna", + "platypus", + "wallaby", + "koala", + "wombat", + "jellyfish", + "sea anemone", + "brain coral", + "flatworm", + "nematode", + "conch", + "snail", + "slug", + "sea slug", + "chiton", + "chambered nautilus", + "Dungeness crab", + "rock crab", + "fiddler crab", + "red king crab", + "American lobster", + "spiny lobster", + "crayfish", + "hermit crab", + "isopod", + "white stork", + "black stork", + "spoonbill", + "flamingo", + "little blue heron", + "great egret", + "bittern bird", + "crane bird", + "limpkin", + "common gallinule", + "American coot", + "bustard", + "ruddy turnstone", + "dunlin", + "common redshank", + "dowitcher", + "oystercatcher", + "pelican", + "king penguin", + "albatross", + "grey whale", + "killer whale", + "dugong", + "sea lion", + "Chihuahua", + "Japanese Chin", + "Maltese", + "Pekingese", + "Shih Tzu", + "King Charles Spaniel", + "Papillon", + "toy terrier", + "Rhodesian Ridgeback", + "Afghan Hound", + "Basset Hound", + "Beagle", + "Bloodhound", + "Bluetick Coonhound", + "Black and Tan Coonhound", + "Treeing Walker Coonhound", + "English foxhound", + "Redbone Coonhound", + "borzoi", + "Irish Wolfhound", + "Italian Greyhound", + "Whippet", + "Ibizan Hound", + "Norwegian Elkhound", + "Otterhound", + "Saluki", + "Scottish Deerhound", + "Weimaraner", + "Staffordshire Bull Terrier", + "American Staffordshire Terrier", + "Bedlington Terrier", + "Border Terrier", + "Kerry Blue Terrier", + "Irish Terrier", + "Norfolk Terrier", + "Norwich Terrier", + "Yorkshire Terrier", + "Wire Fox Terrier", + "Lakeland Terrier", + "Sealyham Terrier", + "Airedale Terrier", + "Cairn Terrier", + "Australian Terrier", + "Dandie Dinmont Terrier", + 
"Boston Terrier", + "Miniature Schnauzer", + "Giant Schnauzer", + "Standard Schnauzer", + "Scottish Terrier", + "Tibetan Terrier", + "Australian Silky Terrier", + "Soft-coated Wheaten Terrier", + "West Highland White Terrier", + "Lhasa Apso", + "Flat-Coated Retriever", + "Curly-coated Retriever", + "Golden Retriever", + "Labrador Retriever", + "Chesapeake Bay Retriever", + "German Shorthaired Pointer", + "Vizsla", + "English Setter", + "Irish Setter", + "Gordon Setter", + "Brittany dog", + "Clumber Spaniel", + "English Springer Spaniel", + "Welsh Springer Spaniel", + "Cocker Spaniel", + "Sussex Spaniel", + "Irish Water Spaniel", + "Kuvasz", + "Schipperke", + "Groenendael dog", + "Malinois", + "Briard", + "Australian Kelpie", + "Komondor", + "Old English Sheepdog", + "Shetland Sheepdog", + "collie", + "Border Collie", + "Bouvier des Flandres dog", + "Rottweiler", + "German Shepherd Dog", + "Dobermann", + "Miniature Pinscher", + "Greater Swiss Mountain Dog", + "Bernese Mountain Dog", + "Appenzeller Sennenhund", + "Entlebucher Sennenhund", + "Boxer", + "Bullmastiff", + "Tibetan Mastiff", + "French Bulldog", + "Great Dane", + "St. 
Bernard", + "husky", + "Alaskan Malamute", + "Siberian Husky", + "Dalmatian", + "Affenpinscher", + "Basenji", + "pug", + "Leonberger", + "Newfoundland dog", + "Great Pyrenees dog", + "Samoyed", + "Pomeranian", + "Chow Chow", + "Keeshond", + "brussels griffon", + "Pembroke Welsh Corgi", + "Cardigan Welsh Corgi", + "Toy Poodle", + "Miniature Poodle", + "Standard Poodle", + "Mexican hairless dog (xoloitzcuintli)", + "grey wolf", + "Alaskan tundra wolf", + "red wolf or maned wolf", + "coyote", + "dingo", + "dhole", + "African wild dog", + "hyena", + "red fox", + "kit fox", + "Arctic fox", + "grey fox", + "tabby cat", + "tiger cat", + "Persian cat", + "Siamese cat", + "Egyptian Mau", + "cougar", + "lynx", + "leopard", + "snow leopard", + "jaguar", + "lion", + "tiger", + "cheetah", + "brown bear", + "American black bear", + "polar bear", + "sloth bear", + "mongoose", + "meerkat", + "tiger beetle", + "ladybug", + "ground beetle", + "longhorn beetle", + "leaf beetle", + "dung beetle", + "rhinoceros beetle", + "weevil", + "fly", + "bee", + "ant", + "grasshopper", + "cricket insect", + "stick insect", + "cockroach", + "praying mantis", + "cicada", + "leafhopper", + "lacewing", + "dragonfly", + "damselfly", + "red admiral butterfly", + "ringlet butterfly", + "monarch butterfly", + "small white butterfly", + "sulphur butterfly", + "gossamer-winged butterfly", + "starfish", + "sea urchin", + "sea cucumber", + "cottontail rabbit", + "hare", + "Angora rabbit", + "hamster", + "porcupine", + "fox squirrel", + "marmot", + "beaver", + "guinea pig", + "common sorrel horse", + "zebra", + "pig", + "wild boar", + "warthog", + "hippopotamus", + "ox", + "water buffalo", + "bison", + "ram (adult male sheep)", + "bighorn sheep", + "Alpine ibex", + "hartebeest", + "impala (antelope)", + "gazelle", + "arabian camel", + "llama", + "weasel", + "mink", + "European polecat", + "black-footed ferret", + "otter", + "skunk", + "badger", + "armadillo", + "three-toed sloth", + "orangutan", + "gorilla", 
+ "chimpanzee", + "gibbon", + "siamang", + "guenon", + "patas monkey", + "baboon", + "macaque", + "langur", + "black-and-white colobus", + "proboscis monkey", + "marmoset", + "white-headed capuchin", + "howler monkey", + "titi monkey", + "Geoffroy's spider monkey", + "common squirrel monkey", + "ring-tailed lemur", + "indri", + "Asian elephant", + "African bush elephant", + "red panda", + "giant panda", + "snoek fish", + "eel", + "silver salmon", + "rock beauty fish", + "clownfish", + "sturgeon", + "gar fish", + "lionfish", + "pufferfish", + "abacus", + "abaya", + "academic gown", + "accordion", + "acoustic guitar", + "aircraft carrier", + "airliner", + "airship", + "altar", + "ambulance", + "amphibious vehicle", + "analog clock", + "apiary", + "apron", + "trash can", + "assault rifle", + "backpack", + "bakery", + "balance beam", + "balloon", + "ballpoint pen", + "Band-Aid", + "banjo", + "baluster / handrail", + "barbell", + "barber chair", + "barbershop", + "barn", + "barometer", + "barrel", + "wheelbarrow", + "baseball", + "basketball", + "bassinet", + "bassoon", + "swimming cap", + "bath towel", + "bathtub", + "station wagon", + "lighthouse", + "beaker", + "military hat (bearskin or shako)", + "beer bottle", + "beer glass", + "bell tower", + "baby bib", + "tandem bicycle", + "bikini", + "ring binder", + "binoculars", + "birdhouse", + "boathouse", + "bobsleigh", + "bolo tie", + "poke bonnet", + "bookcase", + "bookstore", + "bottle cap", + "hunting bow", + "bow tie", + "brass memorial plaque", + "bra", + "breakwater", + "breastplate", + "broom", + "bucket", + "buckle", + "bulletproof vest", + "high-speed train", + "butcher shop", + "taxicab", + "cauldron", + "candle", + "cannon", + "canoe", + "can opener", + "cardigan", + "car mirror", + "carousel", + "tool kit", + "cardboard box / carton", + "car wheel", + "automated teller machine", + "cassette", + "cassette player", + "castle", + "catamaran", + "CD player", + "cello", + "mobile phone", + "chain", + "chain-link 
fence", + "chain mail", + "chainsaw", + "storage chest", + "chiffonier", + "bell or wind chime", + "china cabinet", + "Christmas stocking", + "church", + "movie theater", + "cleaver", + "cliff dwelling", + "cloak", + "clogs", + "cocktail shaker", + "coffee mug", + "coffeemaker", + "spiral or coil", + "combination lock", + "computer keyboard", + "candy store", + "container ship", + "convertible", + "corkscrew", + "cornet", + "cowboy boot", + "cowboy hat", + "cradle", + "construction crane", + "crash helmet", + "crate", + "infant bed", + "Crock Pot", + "croquet ball", + "crutch", + "cuirass", + "dam", + "desk", + "desktop computer", + "rotary dial telephone", + "diaper", + "digital clock", + "digital watch", + "dining table", + "dishcloth", + "dishwasher", + "disc brake", + "dock", + "dog sled", + "dome", + "doormat", + "drilling rig", + "drum", + "drumstick", + "dumbbell", + "Dutch oven", + "electric fan", + "electric guitar", + "electric locomotive", + "entertainment center", + "envelope", + "espresso machine", + "face powder", + "feather boa", + "filing cabinet", + "fireboat", + "fire truck", + "fire screen", + "flagpole", + "flute", + "folding chair", + "football helmet", + "forklift", + "fountain", + "fountain pen", + "four-poster bed", + "freight car", + "French horn", + "frying pan", + "fur coat", + "garbage truck", + "gas mask or respirator", + "gas pump", + "goblet", + "go-kart", + "golf ball", + "golf cart", + "gondola", + "gong", + "gown", + "grand piano", + "greenhouse", + "radiator grille", + "grocery store", + "guillotine", + "hair clip", + "hair spray", + "half-track", + "hammer", + "hamper", + "hair dryer", + "hand-held computer", + "handkerchief", + "hard disk drive", + "harmonica", + "harp", + "combine harvester", + "hatchet", + "holster", + "home theater", + "honeycomb", + "hook", + "hoop skirt", + "gymnastic horizontal bar", + "horse-drawn vehicle", + "hourglass", + "iPod", + "clothes iron", + "carved pumpkin", + "jeans", + "jeep", + "T-shirt", + 
"jigsaw puzzle", + "rickshaw", + "joystick", + "kimono", + "knee pad", + "knot", + "lab coat", + "ladle", + "lampshade", + "laptop computer", + "lawn mower", + "lens cap", + "letter opener", + "library", + "lifeboat", + "lighter", + "limousine", + "ocean liner", + "lipstick", + "slip-on shoe", + "lotion", + "music speaker", + "loupe magnifying glass", + "sawmill", + "magnetic compass", + "messenger bag", + "mailbox", + "tights", + "one-piece bathing suit", + "manhole cover", + "maraca", + "marimba", + "mask", + "matchstick", + "maypole", + "maze", + "measuring cup", + "medicine cabinet", + "megalith", + "microphone", + "microwave oven", + "military uniform", + "milk can", + "minibus", + "miniskirt", + "minivan", + "missile", + "mitten", + "mixing bowl", + "mobile home", + "ford model t", + "modem", + "monastery", + "monitor", + "moped", + "mortar and pestle", + "graduation cap", + "mosque", + "mosquito net", + "vespa", + "mountain bike", + "tent", + "computer mouse", + "mousetrap", + "moving van", + "muzzle", + "metal nail", + "neck brace", + "necklace", + "baby pacifier", + "notebook computer", + "obelisk", + "oboe", + "ocarina", + "odometer", + "oil filter", + "pipe organ", + "oscilloscope", + "overskirt", + "bullock cart", + "oxygen mask", + "product packet / packaging", + "paddle", + "paddle wheel", + "padlock", + "paintbrush", + "pajamas", + "palace", + "pan flute", + "paper towel", + "parachute", + "parallel bars", + "park bench", + "parking meter", + "railroad car", + "patio", + "payphone", + "pedestal", + "pencil case", + "pencil sharpener", + "perfume", + "Petri dish", + "photocopier", + "plectrum", + "Pickelhaube", + "picket fence", + "pickup truck", + "pier", + "piggy bank", + "pill bottle", + "pillow", + "ping-pong ball", + "pinwheel", + "pirate ship", + "drink pitcher", + "block plane", + "planetarium", + "plastic bag", + "plate rack", + "farm plow", + "plunger", + "Polaroid camera", + "pole", + "police van", + "poncho", + "pool table", + "soda 
bottle", + "plant pot", + "potter's wheel", + "power drill", + "prayer rug", + "printer", + "prison", + "missile", + "projector", + "hockey puck", + "punching bag", + "purse", + "quill", + "quilt", + "race car", + "racket", + "radiator", + "radio", + "radio telescope", + "rain barrel", + "recreational vehicle", + "fishing casting reel", + "reflex camera", + "refrigerator", + "remote control", + "restaurant", + "revolver", + "rifle", + "rocking chair", + "rotisserie", + "eraser", + "rugby ball", + "ruler measuring stick", + "sneaker", + "safe", + "safety pin", + "salt shaker", + "sandal", + "sarong", + "saxophone", + "scabbard", + "weighing scale", + "school bus", + "schooner", + "scoreboard", + "CRT monitor", + "screw", + "screwdriver", + "seat belt", + "sewing machine", + "shield", + "shoe store", + "shoji screen / room divider", + "shopping basket", + "shopping cart", + "shovel", + "shower cap", + "shower curtain", + "ski", + "balaclava ski mask", + "sleeping bag", + "slide rule", + "sliding door", + "slot machine", + "snorkel", + "snowmobile", + "snowplow", + "soap dispenser", + "soccer ball", + "sock", + "solar thermal collector", + "sombrero", + "soup bowl", + "keyboard space bar", + "space heater", + "space shuttle", + "spatula", + "motorboat", + "spider web", + "spindle", + "sports car", + "spotlight", + "stage", + "steam locomotive", + "through arch bridge", + "steel drum", + "stethoscope", + "scarf", + "stone wall", + "stopwatch", + "stove", + "strainer", + "tram", + "stretcher", + "couch", + "stupa", + "submarine", + "suit", + "sundial", + "sunglasses", + "sunglasses", + "sunscreen", + "suspension bridge", + "mop", + "sweatshirt", + "swim trunks / shorts", + "swing", + "electrical switch", + "syringe", + "table lamp", + "tank", + "tape player", + "teapot", + "teddy bear", + "television", + "tennis ball", + "thatched roof", + "front curtain", + "thimble", + "threshing machine", + "throne", + "tile roof", + "toaster", + "tobacco shop", + "toilet seat", + 
"torch", + "totem pole", + "tow truck", + "toy store", + "tractor", + "semi-trailer truck", + "tray", + "trench coat", + "tricycle", + "trimaran", + "tripod", + "triumphal arch", + "trolleybus", + "trombone", + "hot tub", + "turnstile", + "typewriter keyboard", + "umbrella", + "unicycle", + "upright piano", + "vacuum cleaner", + "vase", + "vaulted or arched ceiling", + "velvet fabric", + "vending machine", + "vestment", + "viaduct", + "violin", + "volleyball", + "waffle iron", + "wall clock", + "wallet", + "wardrobe", + "military aircraft", + "sink", + "washing machine", + "water bottle", + "water jug", + "water tower", + "whiskey jug", + "whistle", + "hair wig", + "window screen", + "window shade", + "Windsor tie", + "wine bottle", + "airplane wing", + "wok", + "wooden spoon", + "wool", + "split-rail fence", + "shipwreck", + "sailboat", + "yurt", + "website", + "comic book", + "crossword", + "traffic or street sign", + "traffic light", + "dust jacket", + "menu", + "plate", + "guacamole", + "consomme", + "hot pot", + "trifle", + "ice cream", + "popsicle", + "baguette", + "bagel", + "pretzel", + "cheeseburger", + "hot dog", + "mashed potatoes", + "cabbage", + "broccoli", + "cauliflower", + "zucchini", + "spaghetti squash", + "acorn squash", + "butternut squash", + "cucumber", + "artichoke", + "bell pepper", + "cardoon", + "mushroom", + "Granny Smith apple", + "strawberry", + "orange", + "lemon", + "fig", + "pineapple", + "banana", + "jackfruit", + "cherimoya (custard apple)", + "pomegranate", + "hay", + "carbonara", + "chocolate syrup", + "dough", + "meatloaf", + "pizza", + "pot pie", + "burrito", + "red wine", + "espresso", + "tea cup", + "eggnog", + "mountain", + "bubble", + "cliff", + "coral reef", + "geyser", + "lakeshore", + "promontory", + "sandbar", + "beach", + "valley", + "volcano", + "baseball player", + "bridegroom", + "scuba diver", + "rapeseed", + "daisy", + "yellow lady's slipper", + "corn", + "acorn", + "rose hip", + "horse chestnut seed", + "coral 
fungus", + "agaric", + "gyromitra", + "stinkhorn mushroom", + "earth star fungus", + "hen of the woods mushroom", + "bolete", + "corn cob", + "toilet paper", +] diff --git a/lavis/datasets/builders/object3d_caption_builder.py b/lavis/datasets/builders/object3d_caption_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..fd764cb765ea77f1d0ebbe6abd084cb6c506e7e1 --- /dev/null +++ b/lavis/datasets/builders/object3d_caption_builder.py @@ -0,0 +1,65 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import MultiModalDatasetBuilder +from lavis.datasets.datasets.object3d_captioning_datasets import ( + ObjaverseCaptionDataset, + ObjaverseCaptionEvalDataset, + ObjaverseCaptionInstructDataset, + ShapenetCaptionDataset, + ShapenetCaptionEvalDataset, + ShapenetCaptionInstructDataset, +) + +@registry.register_builder("objaverse_mm_caption") +class ObjaverseCaptionBuilder(MultiModalDatasetBuilder): # base of the Objaverse and Shapenet caption builders below + train_dataset_cls = ObjaverseCaptionDataset + eval_dataset_cls = ObjaverseCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/objaverse/defaults_mm_cap.yaml", + } + + def build(self): # builds the splits via the parent, then calls _build_templates on every resulting dataset + datasets = super().build() + build_info = self.config.build_info + # TODO: add option to download templates + templates = build_info.get('templates') # optional config entry, looked up once for all splits + for ds in datasets.values(): + if templates is None: + ds._build_templates(None) + else: + ds._build_templates(templates.storage) + return datasets + +@registry.register_builder("objaverse_mm_caption_instruct") +class ObjaverseCaptionInstructBuilder(ObjaverseCaptionBuilder): # inherits build() from ObjaverseCaptionBuilder + train_dataset_cls = ObjaverseCaptionInstructDataset + eval_dataset_cls = ObjaverseCaptionEvalDataset + + DATASET_CONFIG_DICT = { +
"default": "configs/datasets/objaverse/defaults_mm_cap_instruct.yaml", + } + +@registry.register_builder("shapenet_mm_caption") +class ShapenetCaptionBuilder(ObjaverseCaptionBuilder): # inherits build() from ObjaverseCaptionBuilder + train_dataset_cls = ShapenetCaptionDataset + eval_dataset_cls = ShapenetCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/shapenet/defaults_mm_cap.yaml", + } + +@registry.register_builder("shapenet_mm_caption_instruct") +class ShapenetCaptionInstructBuilder(ObjaverseCaptionBuilder): # inherits build() from ObjaverseCaptionBuilder + train_dataset_cls = ShapenetCaptionInstructDataset + eval_dataset_cls = ShapenetCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/shapenet/defaults_mm_cap_instruct.yaml", + } \ No newline at end of file diff --git a/lavis/datasets/builders/object3d_classification_builder.py b/lavis/datasets/builders/object3d_classification_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..b214d5df726929a3c319a4ac21144f492d42b9b0 --- /dev/null +++ b/lavis/datasets/builders/object3d_classification_builder.py @@ -0,0 +1,19 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.object3d_classification_datasets import ModelNetClassificationDataset + +@registry.register_builder("modelnet40_cls") +class ModelNetClassificationBuilder(MultiModalDatasetBuilder): # declarative only: one dataset class for both train and eval, no build() override + train_dataset_cls = ModelNetClassificationDataset + eval_dataset_cls = ModelNetClassificationDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/modelnet40/defaults_cls.yaml", + } \ No newline at end of file diff --git a/lavis/datasets/builders/object3d_qa_builder.py b/lavis/datasets/builders/object3d_qa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..50ef545f8907300f5f4ed1ad27c66697bc8e5747 --- /dev/null +++ b/lavis/datasets/builders/object3d_qa_builder.py @@ -0,0 +1,19 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.object3d_caption_builder import ObjaverseCaptionBuilder +from lavis.datasets.datasets.object3d_qa_datasets import ObjaverseQADataset + +@registry.register_builder("objaverse_mm_qa") +class ObjaverseQABuilder(ObjaverseCaptionBuilder): # defines no methods, so build() comes from ObjaverseCaptionBuilder + train_dataset_cls = ObjaverseQADataset + eval_dataset_cls = ObjaverseQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/objaverse/defaults_mm_qa.yaml", + } \ No newline at end of file diff --git a/lavis/datasets/builders/retrieval_builder.py b/lavis/datasets/builders/retrieval_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..98ca3bdf572fe007ea1bd97d75aefcb8ae02fe3d --- /dev/null +++ b/lavis/datasets/builders/retrieval_builder.py @@ -0,0 +1,48 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder +from lavis.datasets.datasets.retrieval_datasets import ( + RetrievalDataset, + RetrievalEvalDataset, + VideoRetrievalDataset, + VideoRetrievalEvalDataset, +) + +from lavis.common.registry import registry + + +@registry.register_builder("msrvtt_retrieval") +class MSRVTTRetrievalBuilder(BaseDatasetBuilder): # uses the Video* retrieval dataset classes + train_dataset_cls = VideoRetrievalDataset + eval_dataset_cls = VideoRetrievalEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/msrvtt/defaults_ret.yaml"} + + +@registry.register_builder("didemo_retrieval") +class DiDeMoRetrievalBuilder(BaseDatasetBuilder): # same dataset classes as MSRVTTRetrievalBuilder; only the config path differs + train_dataset_cls = VideoRetrievalDataset + eval_dataset_cls = VideoRetrievalEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/didemo/defaults_ret.yaml"} + + +@registry.register_builder("coco_retrieval") +class COCORetrievalBuilder(BaseDatasetBuilder): # uses the non-video RetrievalDataset classes + train_dataset_cls = RetrievalDataset + eval_dataset_cls = RetrievalEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/coco/defaults_ret.yaml"} + + +@registry.register_builder("flickr30k") +class Flickr30kBuilder(BaseDatasetBuilder): # same dataset classes as COCORetrievalBuilder; only the config path differs + train_dataset_cls = RetrievalDataset + eval_dataset_cls = RetrievalEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/flickr30k/defaults.yaml"} diff --git a/lavis/datasets/builders/text_to_image_generation_builder.py b/lavis/datasets/builders/text_to_image_generation_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..b93368a71f62b463b91f53ba407c767ed44e74cd --- /dev/null +++ b/lavis/datasets/builders/text_to_image_generation_builder.py @@ -0,0 +1,39 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.datasets.subject_driven_t2i_dataset import ( + SubjectDrivenTextToImageDataset, +) +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder + + +@registry.register_builder("blip_diffusion_finetune") +class BlipDiffusionFinetuneBuilder(BaseDatasetBuilder): # train-only builder: no eval_dataset_cls is declared + train_dataset_cls = SubjectDrivenTextToImageDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/blip_diffusion_datasets/defaults.yaml" + } + + def _download_ann(self): # intentionally does nothing + pass + + def build(self): # returns a single-entry dict with the dataset under the "train" key + self.build_processors() + + build_info = self.config.build_info + + dataset = self.train_dataset_cls( + image_dir=build_info.images.storage, + subject_text=build_info.subject_text, + inp_image_processor=self.kw_processors["inp_vis_processor"], + tgt_image_processor=self.kw_processors["tgt_vis_processor"], + txt_processor=self.text_processors["eval"], # the "eval" text processor is the one handed to this train dataset + ) + + return {"train": dataset} diff --git a/lavis/datasets/builders/video_qa_builder.py b/lavis/datasets/builders/video_qa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..28e76a59e238f6b18fb052561bd27bdc4186af36 --- /dev/null +++ b/lavis/datasets/builders/video_qa_builder.py @@ -0,0 +1,77 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.common.utils import get_cache_path +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.video_vqa_datasets import VideoQADataset, VideoQAInstructDataset +from lavis.datasets.datasets.music_avqa import MusicAVQAInstructDataset, MusicAVQADataset + + +class VideoQABuilder(BaseDatasetBuilder): # shared base, not registered itself; the registered subclasses supply DATASET_CONFIG_DICT + train_dataset_cls = VideoQADataset + eval_dataset_cls = VideoQADataset + + def build(self): # builds the splits via the parent, then hands the ans2label path to every dataset + datasets = super().build() + + ans2label = self.config.build_info.annotations.get("ans2label") + if ans2label is None: + raise ValueError("ans2label is not specified in build_info.") + + ans2label = get_cache_path(ans2label.storage) # the name is rebound from the config node to the value get_cache_path returns + + for split in datasets: + datasets[split]._build_class_labels(ans2label) + + return datasets + + +@registry.register_builder("msrvtt_qa") +class MSRVTTQABuilder(VideoQABuilder): + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_qa.yaml", + } + + +@registry.register_builder("msvd_qa") +class MSVDQABuilder(VideoQABuilder): + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_qa.yaml", + } + + +@registry.register_builder("msrvtt_qa_instruct") +class MSRVTTQAInstructBuilder(VideoQABuilder): # swaps in the instruct dataset class; build() is inherited + train_dataset_cls = VideoQAInstructDataset + eval_dataset_cls = VideoQAInstructDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_qa_instruct.yaml", + } + + +@registry.register_builder("msvd_qa_instruct") +class MSVDQAInstructBuilder(VideoQABuilder): # swaps in the instruct dataset class; build() is inherited + train_dataset_cls = VideoQAInstructDataset + eval_dataset_cls = VideoQAInstructDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_qa_instruct.yaml", + } + +@registry.register_builder("musicavqa_mm") +class
MusicAVQABuilder(MultiModalDatasetBuilder): + train_dataset_cls = MusicAVQADataset + eval_dataset_cls = MusicAVQADataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/music_avqa/defaults_mm_qa.yaml"} + +@registry.register_builder("musicavqa_mm_instruct") +class MusicAVQAInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = MusicAVQAInstructDataset + eval_dataset_cls = MusicAVQAInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml"} \ No newline at end of file diff --git a/lavis/datasets/builders/vqa_builder.py b/lavis/datasets/builders/vqa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..028fc434795793d612c646319f2e4f8b6394fd69 --- /dev/null +++ b/lavis/datasets/builders/vqa_builder.py @@ -0,0 +1,152 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder + +from lavis.common.registry import registry +from lavis.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset, AOKVQAInstructDataset +from lavis.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset, COCOVQAInstructDataset +from lavis.datasets.datasets.vg_vqa_datasets import VGVQADataset, VGVQAInstructDataset +from lavis.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset, GQAInstructDataset +from lavis.datasets.datasets.iconqa_datasets import IconQADataset, IconQAEvalDataset, IconQAInstructDataset +from lavis.datasets.datasets.ocr_datasets import OCRVQADataset, OCRVQAInstructDataset +from lavis.datasets.datasets.vizwiz_vqa_datasets import VizWizEvalDataset + +@registry.register_builder("coco_vqa") +class COCOVQABuilder(BaseDatasetBuilder): # also the parent of OKVQABuilder below + train_dataset_cls = COCOVQADataset + eval_dataset_cls =
COCOVQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_vqa.yaml", + "eval": "configs/datasets/coco/eval_vqa.yaml", + } + +@registry.register_builder("coco_vqa_instruct") +class COCOVQAInstructBuilder(BaseDatasetBuilder): # instruct train class, same eval class and "eval" config as COCOVQABuilder + train_dataset_cls = COCOVQAInstructDataset + eval_dataset_cls = COCOVQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_vqa_instruct.yaml", + "eval": "configs/datasets/coco/eval_vqa.yaml", + } + +@registry.register_builder("vg_vqa") +class VGVQABuilder(BaseDatasetBuilder): # no eval_dataset_cls is set in this class + train_dataset_cls = VGVQADataset + DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa.yaml"} + +@registry.register_builder("vg_vqa_instruct") +class VGVQAInstructBuilder(BaseDatasetBuilder): # no eval_dataset_cls is set in this class + train_dataset_cls = VGVQAInstructDataset + DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa_instruct.yaml"} + +@registry.register_builder("ok_vqa") +class OKVQABuilder(COCOVQABuilder): # inherits the COCO VQA dataset classes; the replaced config dict has no "eval" entry + DATASET_CONFIG_DICT = { + "default": "configs/datasets/okvqa/defaults.yaml", + } + +@registry.register_builder("ok_vqa_instruct") +class OKVQAInstructBuilder(COCOVQAInstructBuilder): # inherits the COCO VQA instruct dataset classes; the replaced config dict has no "eval" entry + DATASET_CONFIG_DICT = { + "default": "configs/datasets/okvqa/defaults_instruct.yaml", + } + +@registry.register_builder("aok_vqa") +class AOKVQABuilder(BaseDatasetBuilder): + train_dataset_cls = AOKVQADataset + eval_dataset_cls = AOKVQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"} + +@registry.register_builder("aok_vqa_instruct") +class AOKVQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = AOKVQAInstructDataset + eval_dataset_cls = AOKVQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults_instruct.yaml"} + + +@registry.register_builder("gqa") +class GQABuilder(BaseDatasetBuilder): + train_dataset_cls = GQADataset + eval_dataset_cls = GQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/gqa/defaults.yaml", +
"balanced_val": "configs/datasets/gqa/balanced_val.yaml", + "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml", + } + +@registry.register_builder("gqa_instruct") +class GQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = GQAInstructDataset + eval_dataset_cls = GQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/gqa/defaults_instruct.yaml", + "balanced_val": "configs/datasets/gqa/balanced_val_instruct.yaml", + "balanced_testdev": "configs/datasets/gqa/balanced_testdev_instruct.yaml", + } + +@registry.register_builder("iconqa") +class IconQABuilder(BaseDatasetBuilder): + train_dataset_cls = IconQADataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/iconqa/defaults.yaml", + } + +@registry.register_builder("iconqa_instruct") +class IconQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = IconQAInstructDataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/iconqa/defaults_instruct.yaml", + } + +@registry.register_builder("scienceqa") +class ScienceQABuilder(BaseDatasetBuilder): # reuses the IconQA dataset classes with a scienceqa config + train_dataset_cls = IconQADataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/scienceqa/defaults.yaml"} + +@registry.register_builder("scienceqa_instruct") +class ScienceQAInstructBuilder(BaseDatasetBuilder): # reuses the IconQA instruct/eval dataset classes with a scienceqa config + train_dataset_cls = IconQAInstructDataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/scienceqa/defaults_instruct.yaml"} + +@registry.register_builder("ocr_vqa") +class OCRVQABuilder(BaseDatasetBuilder): # the same dataset class serves both train and eval + train_dataset_cls = OCRVQADataset + eval_dataset_cls = OCRVQADataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/defaults.yaml"} + +@registry.register_builder("ocr_vqa_instruct") +class OCRVQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = OCRVQAInstructDataset + eval_dataset_cls =
def load_video(video_path, n_frms=MAX_INT, height=-1, width=-1, sampling="uniform"):
    """Decode a sampled subset of frames from a video with decord.

    Args:
        video_path: Location of the video; passed to ``VideoReader`` as ``uri``.
        n_frms: Maximum number of frames to sample. Clamped to the number of
            frames in the video.
        height: Forwarded to ``VideoReader`` unchanged.
        width: Forwarded to ``VideoReader`` unchanged.
        sampling: ``"uniform"`` picks evenly spaced frame indices over the
            whole video. ``"headtail"`` draws ``n_frms // 2`` random frames
            from the first half and ``n_frms // 2`` from the second half
            (so an odd ``n_frms`` yields ``n_frms - 1`` frames).

    Returns:
        The sampled frames as a float tensor, permuted from the
        (T, H, W, C) layout returned by ``get_batch`` to (C, T, H, W).

    Raises:
        ValueError: If the video has no frames or ``n_frms`` is not positive.
        NotImplementedError: If ``sampling`` is not a supported strategy.
    """
    vr = VideoReader(uri=video_path, height=height, width=width)

    vlen = len(vr)
    n_frms = min(n_frms, vlen)
    if n_frms <= 0:
        # Previously this surfaced as an opaque ZeroDivisionError while
        # computing the sampling step.
        raise ValueError(
            "Cannot sample frames from {!r}: video has {} frames, n_frms={}.".format(
                video_path, vlen, n_frms
            )
        )

    if sampling == "uniform":
        # linspace yields exactly n_frms indices in [0, vlen). np.arange with
        # a float step can round up to n_frms + 1 entries, the last of which
        # equals vlen and is out of range.
        indices = np.linspace(0, vlen, num=n_frms, endpoint=False).astype(int)
    elif sampling == "headtail":
        half = n_frms // 2
        indices_h = sorted(rnd.sample(range(vlen // 2), half))
        indices_t = sorted(rnd.sample(range(vlen // 2, vlen), half))
        indices = indices_h + indices_t
    else:
        raise NotImplementedError(
            "Unsupported sampling strategy: {!r}".format(sampling)
        )

    # get_batch -> T, H, W, C
    frms = vr.get_batch(indices).permute(3, 0, 1, 2).float()  # (C, T, H, W)

    return frms
def reorg_datasets_by_split(datasets):
    """
    Regroup a name-keyed collection of datasets so it is keyed by split.

    Args:
        datasets: dict mapping a dataset name to a dict of
            {split_name: dataset object}.

    Returns:
        Dict {split_name: [dataset, ...]}; each list keeps the order in which
        the datasets were encountered while iterating the input.
    """
    by_split = {}

    # The dataset names themselves are irrelevant here; only the per-split
    # objects are collected.
    for splits in datasets.values():
        for split_name, split_dataset in splits.items():
            by_split.setdefault(split_name, []).append(split_dataset)

    return by_split
def extract_archive(from_path, to_path=None, overwrite=False):
    """Extract a ``.tar.gz``/``.tgz``, ``.zip`` or ``.gz`` archive.

    Args:
        from_path: the path of the archive.
        to_path: the root path of the extracted files (defaults to the
            directory of ``from_path``). Note that a plain ``.gz`` file is
            always decompressed next to ``from_path`` (the name without the
            ``.gz`` suffix), regardless of ``to_path``.
        overwrite: overwrite existing files (False). Only honoured for tar
            and zip archives; a plain ``.gz`` file is always rewritten.

    Returns:
        List of paths to extracted files even if not overwritten.

    Raises:
        ValueError: if a tar member's name resolves outside ``to_path``.
        NotImplementedError: if the archive type is not supported.

    Examples:
        >>> from_path = './validation.tar.gz'
        >>> to_path = './'
        >>> extract_archive(from_path, to_path)
        >>> ['.data/val.de', '.data/val.en']

    """

    if to_path is None:
        to_path = os.path.dirname(from_path)

    if from_path.endswith((".tar.gz", ".tgz")):
        logging.info("Opening tar file {} to {}.".format(from_path, to_path))
        extract_root = os.path.abspath(to_path)
        with tarfile.open(from_path, "r") as tar:
            files = []
            for file_ in tqdm(tar):
                file_path = os.path.join(to_path, file_.name)
                # Archives are typically downloaded; refuse member names
                # (absolute paths, "..") that would land outside to_path.
                # This is a name check only and does not inspect link targets.
                resolved = os.path.abspath(file_path)
                if os.path.commonpath([extract_root, resolved]) != extract_root:
                    raise ValueError(
                        "Refusing to extract {} outside of {}.".format(
                            file_.name, to_path
                        )
                    )
                if file_.isfile():
                    files.append(file_path)
                    if os.path.exists(file_path):
                        logging.info("{} already extracted.".format(file_path))
                        if not overwrite:
                            continue
                tar.extract(file_, to_path)
            logging.info("Finished extracting tar file {}.".format(from_path))
            return files

    elif from_path.endswith(".zip"):
        assert zipfile.is_zipfile(from_path), from_path
        logging.info("Opening zip file {} to {}.".format(from_path, to_path))
        with zipfile.ZipFile(from_path, "r") as zfile:
            files = []
            for file_ in tqdm(zfile.namelist()):
                file_path = os.path.join(to_path, file_)
                files.append(file_path)
                if os.path.exists(file_path):
                    logging.info("{} already extracted.".format(file_path))
                    if not overwrite:
                        continue
                zfile.extract(file_, to_path)
        # namelist() also reports directory entries; keep regular files only.
        files = [f for f in files if os.path.isfile(f)]
        logging.info("Finished extracting zip file {}.".format(from_path))
        return files

    elif from_path.endswith(".gz"):
        logging.info("Opening gz file {} to {}.".format(from_path, to_path))
        default_block_size = 65536
        filename = from_path[:-3]
        files = [filename]
        with gzip.open(from_path, "rb") as gzfile, open(filename, "wb") as d_file:
            # Stream in fixed-size blocks until read() returns b"". The
            # previous version issued one more (empty) write after the loop.
            for block in iter(lambda: gzfile.read(default_block_size), b""):
                d_file.write(block)
        logging.info("Finished extracting gz file {}.".format(from_path))
        return files

    else:
        raise NotImplementedError(
            "We currently only support tar.gz, .tgz, .gz and zip achives."
        )
def load_clip(video_path, num_frames, target_height, target_width, start_time=None, end_time=None, sampling="headtail"):
    """Sample frames from a video clip with the requested strategy.

    All arguments except ``sampling`` are forwarded positionally to the
    selected sampler: ``head_tail_frame_sampling`` for ``"headtail"`` and
    ``uniform_frame_sampling`` for ``"uniform"``. The return value is
    whatever that sampler returns.

    Raises:
        NotImplementedError: if ``sampling`` is neither ``"headtail"`` nor
            ``"uniform"``.
    """
    if sampling not in ("headtail", "uniform"):
        raise NotImplementedError

    sampler = head_tail_frame_sampling if sampling == "headtail" else uniform_frame_sampling
    return sampler(video_path, num_frames, target_height, target_width, start_time, end_time)
b/lavis/datasets/datasets/__pycache__/audio_captioning_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/audio_classification_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/audio_classification_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af603ba68b3a85a60370f2bda5583433115b96eb Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/audio_classification_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/audio_qa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/audio_qa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eeb3fe8eb45ebd23321cc997901d8f9f3e689aee Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/audio_qa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/avsd_dialogue_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/avsd_dialogue_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42e7c8ef5e00c0357c3acbb53af916c922df917a Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/avsd_dialogue_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/base_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/base_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c0ca198ab53a63c53f391b224694c475c7c0cbf Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/base_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/capfilt_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/capfilt_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb1e96b1ee52377d5b6bee2f32e4e2c6935f74bc Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/capfilt_dataset.cpython-310.pyc differ diff --git 
a/lavis/datasets/datasets/__pycache__/caption_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/caption_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..391cb39639293f8564d730ee8961d6262668ceed Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/caption_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/coco_caption_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/coco_caption_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c84d92644af196d04471941650189c12dfa7510 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/coco_caption_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/coco_vqa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/coco_vqa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c0d4113b331268a91b7cc942ff7885b1cf80f37 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/coco_vqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/dialogue_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/dialogue_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12332fd7a1b8771010fed98c733ab565e8e558c4 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/dialogue_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/discriminatory_reasoning_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/discriminatory_reasoning_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c9a977595fa9de44b3e0063b3e56f48d87ed780 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/discriminatory_reasoning_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/gqa_datasets.cpython-310.pyc 
b/lavis/datasets/datasets/__pycache__/gqa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4716b7748e9fe4b6cad008e651b36dbc51c8295 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/gqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/iconqa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/iconqa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..224383423dcc85f31b882d790b04b8b0011ec45e Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/iconqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/image_text_pair_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/image_text_pair_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ae7cc6ff5c140d2b7f7b8227f02943501b4f742 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/image_text_pair_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/imagefolder_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/imagefolder_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bf9a90fbb8805ebef1d55982bca826a5aaf5374 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/imagefolder_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/laion_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/laion_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16278b241084b20a3175f9a73a5033c72b4a874a Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/laion_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/llava150k_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/llava150k_dataset.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..51b0e6cc01c6a78cce412bbd55327fc993722f50 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/llava150k_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/multimodal_classification_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/multimodal_classification_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e486f90fb855d5a4ec40d00029093e927df2471e Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/multimodal_classification_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/music_avqa.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/music_avqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6708784c9038f159a0f434bb6bcc98f4007f9dd4 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/music_avqa.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/nlvr_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/nlvr_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bd5e089591309a91e370d7d29a4649194e932b8 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/nlvr_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/object3d_captioning_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/object3d_captioning_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd78426b224c3c5f1ee4ecb4369b0a472bb37d36 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/object3d_captioning_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/object3d_classification_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/object3d_classification_datasets.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..baf455d5ba236d59dc3e25f80d5b67f54c66f9b1 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/object3d_classification_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/object3d_qa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/object3d_qa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1acada56356f549bcfec0bb4d7253cc9c93767b7 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/object3d_qa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/ocr_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/ocr_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d18bdcc05325684fcbad217902a5efb6fed3bbc Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/ocr_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/retrieval_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/retrieval_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..066dcf3935a86ab2ae703676499b64edb07e6a29 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/retrieval_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/snli_ve_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/snli_ve_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1795c316bed55a2324ace316db61dd5e08638b44 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/snli_ve_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/subject_driven_t2i_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/subject_driven_t2i_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30eb95287a0079e01a5fc9dcfc02a0ab4c4ed804 Binary files /dev/null and 
b/lavis/datasets/datasets/__pycache__/subject_driven_t2i_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/textcaps_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/textcaps_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..617749155d3bfeb30296e7dcb94b62b76746243a Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/textcaps_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/valor_caption.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/valor_caption.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ef5e75d5e8df3b79286b75fb47886cba6011021 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/valor_caption.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/vatex_captioning_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/vatex_captioning_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f890fbd26ba3dfe9bef9e6b5a6f4e5e43c41a498 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/vatex_captioning_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/vg_vqa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/vg_vqa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a5416d60bcf831cc782a2394978aad676352981 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/vg_vqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/video_caption_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/video_caption_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9540e6823c0c8ef4e5aca19262d61e956cbf6df8 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/video_caption_datasets.cpython-310.pyc differ diff --git 
a/lavis/datasets/datasets/__pycache__/video_vqa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/video_vqa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc6433e6feee85a38bc7e1e72b47f99058545857 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/video_vqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/violin_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/violin_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21d5a937897f3f161a5a14bbebba677e325e4aa8 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/violin_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/visdial_dialogue_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/visdial_dialogue_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ecd423faae4da1f5ed9dc387104765d2f118c37 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/visdial_dialogue_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/vizwiz_vqa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/vizwiz_vqa_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f5ab8e29cca783208e35caaff78027fc21cafbc Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/vizwiz_vqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/vlep_dataset.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/vlep_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91d468bdd31db9837eb48c20fbc945fc67f01586 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/vlep_dataset.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/vqa_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/vqa_datasets.cpython-310.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..cde3c95ab836279f7362a22cedbd8a8a1a1c1cb3 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/vqa_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/vsr_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/vsr_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd0a291ce7a69e5fb1d08c009a2d0e3a2db678bf Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/vsr_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/__pycache__/yt8m_video_dialogue_datasets.cpython-310.pyc b/lavis/datasets/datasets/__pycache__/yt8m_video_dialogue_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cbedcf6a5e1e3a99c87c47683f903bf060763c3 Binary files /dev/null and b/lavis/datasets/datasets/__pycache__/yt8m_video_dialogue_datasets.cpython-310.pyc differ diff --git a/lavis/datasets/datasets/aok_vqa_datasets.py b/lavis/datasets/datasets/aok_vqa_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..53ad32e793c7f9c025274a11a2712404a205657e --- /dev/null +++ b/lavis/datasets/datasets/aok_vqa_datasets.py @@ -0,0 +1,167 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class AOKVQADataset(VQADataset, __DisplMixin):
    """A-OKVQA training dataset yielding an image, a question and weighted answers."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        """Build one training sample from the annotation at ``index``.

        Returns a dict with the processed image, the processed question, the
        distinct direct answers, and for each of them the fraction of the
        annotation's ``direct_answers`` entries it accounts for.
        """
        record = self.annotation[index]

        picture = Image.open(os.path.join(self.vis_root, record["image"])).convert("RGB")
        picture = self.vis_processor(picture)
        question = self.text_processor(record["question"])

        direct_answers = record["direct_answers"]

        # Every occurrence contributes an equal share, so repeated answers
        # accumulate a proportionally larger weight.
        weight_by_answer = {}
        for candidate in direct_answers:
            weight_by_answer[candidate] = (
                weight_by_answer.get(candidate, 0) + 1 / len(direct_answers)
            )

        return {
            "image": picture,
            "text_input": question,
            "answers": list(weight_by_answer),
            "weights": list(weight_by_answer.values()),
        }
AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + + self.vis_root = vis_root + + self.annotation = json.load(open(ann_paths[0])) + + answer_list_path = ann_paths[1] + if os.path.exists(answer_list_path): + self.answer_list = json.load(open(answer_list_path)) + else: + self.answer_list = None + + try: + self.coco_fmt_qust_file = ann_paths[2] + self.coco_fmt_anno_file = ann_paths[3] + except IndexError: + self.coco_fmt_qust_file = None + self.coco_fmt_anno_file = None + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self._add_instance_ids() + + def collater(self, samples): + ( + image_list, + question_list, + question_id_list, + instance_id_list, + choices_list, + correct_choice_idx_list, + direct_answers_list, + ) = ([], [], [], [], [], [], []) + + for sample in samples: + image_list.append(sample["image"]) + question_list.append(sample["text_input"]) + question_id_list.append(sample["question_id"]) + instance_id_list.append(sample["instance_id"]) + choices_list.append(sample["choices"]) + correct_choice_idx_list.append(sample["correct_choice_idx"]) + direct_answers_list.append(sample["direct_answers"]) + + return { + "image": torch.stack(image_list, dim=0), + "text_input": question_list, + "question_id": question_id_list, + "instance_id": instance_id_list, + "choices": choices_list, + "correct_choice_idx": correct_choice_idx_list, + "direct_answers": direct_answers_list, + } + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + choices = ann["choices"] + if "correct_choice_idx" in ann: + correct_choice_idx = 
# File: lavis/datasets/datasets/audio_captioning_datasets.py
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
from collections import OrderedDict
import torch
import copy
import pathlib
import random
import json
import pandas as pd
import torchaudio
import torch
from tqdm import tqdm

from lavis.datasets.datasets.base_dataset import BaseDataset


class __DisplMixin:
    """Mixin exposing a single sample in a display-friendly form."""

    def displ_item(self, index):
        """Return an ordered mapping built from the raw annotation and the processed sample."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": ann["image"],
                "label": ann["caption"],
                "audio": sample["audio"],
                "audio_path": sample["audio_path"],
                "caption": sample["caption"],
            }
        )


class AudioCaptioningDataset(BaseDataset, __DisplMixin):
    """Base class for audio captioning datasets that may carry several modalities.

    Subclasses must define ``sample_id_key`` and implement ``get_audio_path``
    and ``__getitem__``.
    """

    def __init__(self, **kwargs):
        """Store per-modality roots/processors and index the files found on disk.

        Reads ``modalities``, ``vis_processor``, ``text_processor``, ``vis_root``
        and ``ann_paths`` from ``kwargs``, plus ``<modality>_root`` and
        ``<modality>_processor`` for every listed modality.
        """
        self.modalities = kwargs['modalities']
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])
        for modality in self.modalities:
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())

    def get_audio_path(self, ann):
        """Return the audio file path for ``ann``; must be provided by subclasses."""
        raise NotImplementedError("Subclasses should implement this!")

    def is_empty_audio(self, ann):
        """Return True when the audio for ``ann`` cannot be loaded or has no elements."""
        path = self.get_audio_path(ann)
        try:
            waveform, _ = torchaudio.load(path)
        except torchaudio.TorchaudioException:
            return True  # Audio loading failed

        # A stereo-to-mono reduction is not needed here: averaging channels
        # cannot change whether the tensor is empty.
        return waveform.nelement() == 0

    def get_existing_audio_annotations(self):
        """Return the sample ids of the files found in ``audio_root``.

        The subclasses in this module name audio files ``<id>_<time suffix>``,
        so only the part after the LAST underscore is stripped; ids that
        themselves contain ``_`` are kept intact.
        """
        return [f.rsplit('_', 1)[0] for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        """Return the entries found in ``video_root``."""
        return os.listdir(self.video_root)

    def get_existing_images_annotations(self):
        """Return the entries found in ``vis_root``."""
        return os.listdir(self.vis_root)

    def get_video_path(self, ann):
        """Return the resolved path of the video belonging to ``ann``."""
        return pathlib.Path(os.path.join(self.video_root, ann[self.sample_id_key])).resolve()

    def get_images_path(self, ann):
        """Return the resolved path of the image entry belonging to ``ann``."""
        return pathlib.Path(os.path.join(self.vis_root, ann[self.sample_id_key])).resolve()

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        raise NotImplementedError("Subclasses should implement this!")

    def _build_templates(self, templates_path):
        """Load caption templates from a JSON file, or disable them when the path is None."""
        # use captions not templates
        if templates_path is None:
            self.templates = None
        else:
            with open(templates_path) as f:
                self.templates = json.load(f)
class AudioSetDataset(AudioCaptioningDataset):
    """AudioSet clips whose captions are derived from their label display names."""

    def __init__(self, **kwargs):
        """Load the AudioSet segment CSVs and keep clips available for every modality.

        ``kwargs['ann_paths'][:-1]`` are segment CSVs; ``kwargs['ann_paths'][-1]``
        is the label CSV with ``mid`` and ``display_name`` columns. The other
        keyword arguments are forwarded to ``AudioCaptioningDataset``.
        """
        self.dataset_name = 'audioset'
        self.sample_id_key = 'YTID'
        # The label file was previously also read through a bare open() into an
        # unused list, which leaked the file handle; a single read is enough.
        label_df = pd.read_csv(kwargs['ann_paths'][-1])
        self.mid2label = dict(zip(label_df['mid'].tolist(), label_df['display_name'].tolist()))

        annotation = []
        for ann_path in kwargs['ann_paths'][:-1]:
            df = pd.read_csv(
                ann_path,
                comment='#',
                header=None,
                names=['YTID', 'start_seconds', 'end_seconds', 'positive_labels'],
                skiprows=3,
                quotechar='"',
                delimiter=',',
                skipinitialspace=True,
            )
            annotation.extend(row.to_dict() for _, row in df.iterrows())

        # The parent must not parse the CSVs again with its generic loader.
        kwargs['ann_paths'] = []
        super().__init__(**kwargs)
        self.annotation = annotation
        self.sample_ids = set.intersection(
            *[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]
        )

        self.annotation = [
            ann
            for ann in self.annotation
            if ann[self.sample_id_key] in self.sample_ids and ann[self.sample_id_key]
        ]
        self._add_instance_ids()
        print(f"Loaded {len(self.annotation)} examples.")

    def get_audio_path(self, ann):
        """Return the wav path for ``ann``, adding ``start_seconds``/``end_seconds`` if absent.

        When ``end_seconds`` is missing, ``start_time`` is converted to
        ``start_seconds`` (and removed) and the clip is taken to last 10 seconds.
        """
        if 'end_seconds' not in ann:
            ann['start_seconds'] = float(ann['start_time'])
            del ann['start_time']
            ann['end_seconds'] = float(ann['start_seconds']) + 10.0
        file_name = ann[self.sample_id_key] + '_{:.1f}-{:.1f}.wav'.format(ann['start_seconds'], ann['end_seconds'])
        return str(os.path.realpath(os.path.join(self.audio_root, file_name))).replace('all_audio/', '')

    def __getitem__(self, index):
        """Return the processed sample at ``index``, or None when its audio sums to zero."""
        ann = copy.deepcopy(self.annotation[index])
        ann["sample_id"] = ann["YTID"]
        labels = [self.mid2label[mid] for mid in ann['positive_labels'].split(',')]
        ann['label'] = labels
        if self.templates:
            ann['captions'] = [random.choice(self.templates).format(obj) for obj in labels]
        else:
            ann['captions'] = [random.choice(labels)]

        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            # The processor now also runs after a random choice among several
            # paths (as AudioCapsDataset does); skipping it left the modality
            # key unset and broke the audio check below.
            is_image = 'image' in modality
            processor = getattr(self, f"{'vis' if is_image else modality}_processor")
            ann['image' if is_image else modality] = processor(path)

        if isinstance(ann['captions'], list):
            ann['text_input'] = self.text_processor(random.choice(ann['captions']))
        else:
            ann['text_input'] = self.text_processor(ann['captions'])

        if ann["audio"].sum() == 0:
            return None

        return ann


class AudioSetInstructDataset(AudioSetDataset):
    """Instruction variant: the caption becomes the target, the input is the processed empty string."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class AudioSetEvalDataset(AudioSetDataset):
    """Evaluation variant: samples are returned without ``text_input``."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class AudioCapsDataset(AudioCaptioningDataset):
    """AudioCaps clips, read from audio files or from cached ``.pt`` tensors."""

    def __init__(self, **kwargs):
        """Resolve the split from the first annotation path and filter usable clips.

        Optional ``kwargs``: ``cached`` (load tensors from ``cached_dir``
        instead of processing audio files) and ``missing_ids`` (sample ids to drop).
        """
        self.sample_id_key = "youtube_id"
        first_path = kwargs['ann_paths'][0]
        self.split = 'train' if 'train' in first_path else 'test' if 'test' in first_path else 'val'
        self.modalities = kwargs['modalities']
        for modality in self.modalities:
            kwargs[f"{modality}_root"] = os.path.join(kwargs[f"{modality}_root"], f'{self.split}')
        # Set before the parent constructor runs: it calls
        # get_existing_audio_annotations(), which depends on these flags.
        self.cached = kwargs.get('cached', False)
        self.cache_dir = kwargs.get('cached_dir', '')
        super().__init__(**kwargs)

        self.sample_ids = set.intersection(
            *[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]
        )
        missing_ids = set(kwargs.get('missing_ids', []))  # built once for O(1) membership tests
        self.annotation = [
            ann
            for ann in self.annotation
            if ann[self.sample_id_key] in self.sample_ids and ann[self.sample_id_key] not in missing_ids
        ]
        self._add_instance_ids()
        print(f"Loaded {len(self.annotation)} examples.")

    def get_existing_audio_annotations(self):
        """Return sample ids found in the cache directory (when cached) or in ``audio_root``.

        This used to be a function nested inside ``__init__`` that was never
        called and read a misspelt ``self.cached_dir``; it is now a real override.
        File names are ``<id>_<start>...``, so only the last ``_`` field is stripped.
        """
        source_dir = self.cache_dir if self.cached else self.audio_root
        return [f.rsplit('_', 1)[0] for f in os.listdir(source_dir)]

    def _ensure_clip_bounds(self, ann):
        """Add ``start_seconds``/``end_seconds`` (10 second clip) to ``ann`` when absent."""
        if 'end_seconds' not in ann:
            ann['start_seconds'] = float(ann['start_time'])
            ann['end_seconds'] = ann['start_seconds'] + 10.0

    def get_audio_path(self, ann):
        """Return the ``.flac`` path of the clip inside ``audio_root``."""
        self._ensure_clip_bounds(ann)
        return os.path.join(self.audio_root, ann[self.sample_id_key] + '_{}.flac'.format(int(ann['start_seconds'])))

    def get_cached_audio_path(self, ann):
        """Return the ``.flac.pt`` path of the cached clip inside ``cache_dir``."""
        self._ensure_clip_bounds(ann)
        return os.path.join(self.cache_dir, ann[self.sample_id_key] + '_{}.flac.pt'.format(int(ann['start_seconds'])))

    def __getitem__(self, index):
        """Return the processed sample at ``index``, or None when its audio sums to zero."""
        ann = copy.deepcopy(self.annotation[index])
        ann['captions'] = ann['caption']
        ann["sample_id"] = ann["youtube_id"]

        for modality in self.modalities:
            if modality == 'audio' and self.cached:
                ann[f"{modality}_path"] = getattr(self, f"get_cached_{modality}_path")(ann)
                ann["audio"] = torch.load(ann[f"{modality}_path"])
                continue
            path = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            is_image = 'image' in modality
            processor = getattr(self, f"{'vis' if is_image else modality}_processor")
            ann['image' if is_image else modality] = processor(path)

        if isinstance(ann['captions'], list):
            ann['text_input'] = self.text_processor(random.choice(ann['captions']))
        else:
            ann['text_input'] = self.text_processor(ann['captions'])

        if ann["audio"].sum() == 0:
            return None

        return ann


class AudioCapsInstructDataset(AudioCapsDataset):
    """Instruction variant: the caption becomes the target, the input is the processed empty string."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class AudioCapsEvalDataset(AudioCapsDataset):
    """Evaluation variant: one annotation per ``youtube_id`` and no ``text_input``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Keep only the first annotation seen for each clip, preserving order.
        seen = set()
        unique = []
        for ann in self.annotation:
            if ann["youtube_id"] not in seen:
                seen.add(ann["youtube_id"])
                unique.append(ann)
        self.annotation = unique

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class ClothoV2Dataset(BaseDataset, __DisplMixin):
    """Clotho v2 audio captioning dataset with five reference captions per clip."""

    def __init__(self, **kwargs):
        """Load the annotations and collect the five caption columns of every row.

        Reads ``vis_processor``, ``text_processor``, ``vis_root``, ``ann_paths``,
        ``audio_processor`` and ``audio_root`` from ``kwargs``.
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])
        # Captions column names in CSV files
        self._CAPTIONS_KEYS = (
            "caption_1",
            "caption_2",
            "caption_3",
            "caption_4",
            "caption_5",
        )
        # The split is the text between the last "_" and the first following "."
        # of the last annotation path.
        self.split = kwargs['ann_paths'][-1].split('_')[-1].split('.')[0]
        for ann in self.annotation:
            ann["fname"] = ann["file_name"]
            ann["sound_id"] = ann["fname"]
            ann["captions"] = [ann[caption_key] for caption_key in self._CAPTIONS_KEYS]

        self.audio_processor = kwargs["audio_processor"]
        self.audio_root = kwargs["audio_root"]
        self._add_instance_ids()

    def __getitem__(self, index):
        """Return the processed sample at ``index``, or None when its audio sums to zero."""
        ann = copy.deepcopy(self.annotation[index])
        audio_path = os.path.join(self.audio_root, self.split, ann['fname'])
        ann['audio'] = self.audio_processor(audio_path)
        if ann["audio"].sum() == 0:
            return None
        ann['audio_path'] = audio_path
        ann["text_input"] = self.text_processor(random.choice(ann['captions']))
        return ann


class ClothoV2InstructDataset(ClothoV2Dataset):
    """Instruction variant: the caption becomes the target, the input is the processed empty string."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class ClothoV2EvalDataset(ClothoV2Dataset):
    """Evaluation variant: samples are returned without ``text_input``."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class AudioLanguagePretrainDataset(BaseDataset, __DisplMixin):
    """Audio-caption pairs read from a single pre-built annotation JSON file."""

    # (substring looked up in the json file name, sub-directory of audio_root),
    # checked in this order.
    _SOURCE_DIRS = (
        ("AudioSet", "AudioSet_SL_flac"),
        ("BBC_Sound", "BBC_Sound_Effects_flac"),
        ("FreeSound", "FreeSound_flac"),
        ("SoundBible", "SoundBible_flac"),
    )

    def __init__(self, **kwargs):
        """Load annotations from ``kwargs['ann_paths'][-1]`` and store the processors.

        Optional ``kwargs``: ``cached`` / ``cached_dir`` (load ``<id>.pt`` tensors
        instead of processing audio) and ``text_processor``.
        """
        # The per-source files in ann_paths[:-1] are not parsed here; see
        # _load_json_file for the routine that handles them.
        with open(kwargs['ann_paths'][-1]) as f:
            self.annotation = json.load(f)
        self.cached = kwargs.get('cached', False)
        self.cache_dir = kwargs.get('cached_dir', '')
        self.text_processor = kwargs.get('text_processor', None)
        self.audio_processor = kwargs['audio_processor']
        self._add_instance_ids()

    # https://github.com/XinhaoMei/WavCaps/blob/c17ff4fe61a650a5d19fb7df8b85569c9ebc74e3/retrieval/data_handling/pretrain_dataset.py#L55
    def _load_json_file(self, files, audio_root, blacklist=None):
        """Flatten the given caption JSON files into a list of annotation dicts.

        Args:
            files: paths of JSON files holding ``num_captions_per_audio`` and ``data``.
            audio_root: directory containing the per-source ``*_flac`` folders.
            blacklist: optional path of a JSON file mapping ``"FreeSound"`` /
                ``"AudioSet"`` to ids that must be skipped.

        Returns:
            A list of dicts. Single-caption entries carry ``audio_path`` and are
            skipped when that file does not exist.

        Raises:
            ValueError: if a single-caption file name matches none of the known
                sources (previously ``audio_path`` was unbound, or silently
                reused from an earlier item).
        """
        json_data = []
        if blacklist is not None:
            with open(blacklist, 'r') as f:
                blacklist = json.load(f)

        for file in files:
            with open(file, "r") as f:
                json_obj = json.load(f)

            if json_obj["num_captions_per_audio"] == 1:
                blocked = frozenset()
                if blacklist is not None:
                    if "FreeSound" in file:
                        blocked = frozenset(blacklist["FreeSound"])
                    elif "AudioSet" in file:
                        blocked = frozenset(blacklist["AudioSet"])
                source_dir = next((sub for key, sub in self._SOURCE_DIRS if key in file), None)

                for item in tqdm(json_obj["data"]):
                    if item["id"] in blocked:
                        continue
                    if source_dir is None:
                        raise ValueError(f"Cannot infer the audio source directory for {file}")
                    audio_path = f"{audio_root}/{source_dir}/{item['id'].split('.')[0]}.flac"
                    if not os.path.exists(audio_path):
                        continue
                    json_data.append(
                        {
                            "audio": item["audio"],
                            "caption": item["caption"],
                            "id": item['id'],
                            "duration": item["duration"],
                            'audio_path': audio_path,
                        }
                    )
            else:
                # NOTE(review): these entries carry no "audio_path", which
                # __getitem__ reads when not cached - confirm intended usage.
                for item in json_obj["data"]:
                    for i in range(1, json_obj["num_captions_per_audio"] + 1):
                        json_data.append(
                            {
                                "audio": item["audio"],
                                "caption": item[f"caption_{i}"],
                                "id": item['id'],
                                "duration": item["duration"],
                            }
                        )
        return json_data

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        """Return the sample at ``index``, or None when its audio sums to zero."""
        ann = self.annotation[index]

        ## CACHED REPRESENTATIONS
        if self.cached:
            audio = torch.load(os.path.join(self.cache_dir, f"{ann['id']}.pt"), map_location=torch.device('cpu'))
        else:
            audio = self.audio_processor(ann["audio_path"])

        if audio.sum() == 0:
            return None

        caption = self.text_processor(ann["caption"])

        return {
            "audio": audio,
            "text_input": caption,
            "sample_id": ann["id"],
            "instance_id": ann["instance_id"],
        }

    def _build_templates(self, templates_path):
        """Templates are not used by this dataset."""
        self.templates = None


class AudioLanguagePretrainInstructDataset(AudioLanguagePretrainDataset):
    """Instruction variant: the caption becomes the target, the input is the processed empty string."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class AudioLanguagePretrainEvalDataset(AudioLanguagePretrainDataset):
    """Evaluation variant: samples are returned without ``text_input``."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
# File: lavis/datasets/datasets/audio_classification_datasets.py
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
from collections import OrderedDict
import torch
import copy
import pathlib
import random
import json
import pandas as pd
import torchaudio
import torch
from tqdm import tqdm

from lavis.datasets.datasets.base_dataset import BaseDataset


class __DisplMixin:
    """Mixin exposing a single sample in a display-friendly form."""

    def displ_item(self, index):
        """Return an ordered mapping built from the raw annotation and the processed sample."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": ann["image"],
                "label": ann["caption"],
                "audio": sample["audio"],
                "audio_path": sample["audio_path"],
                "caption": sample["caption"],
            }
        )


class ESC50(BaseDataset, __DisplMixin):
    """ESC-50 audio classification dataset; class names come from the ``category`` field."""

    def __init__(self, **kwargs):
        """Store per-modality roots/processors and derive the list of class names.

        Reads ``modalities``, ``vis_processor``, ``text_processor``, ``vis_root``
        and ``ann_paths`` from ``kwargs``, plus ``<modality>_root`` and
        ``<modality>_processor`` for every listed modality.
        """
        self.modalities = kwargs['modalities']
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])
        for modality in self.modalities:
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
        # Sorted so the class order is identical in every process; iterating a
        # bare set of strings is not stable across interpreter runs.
        categories = sorted({ann['category'] for ann in self.annotation})
        self.classnames = [c.replace('_', ' ') for c in categories]

    def get_audio_path(self, ann):
        """Return the audio file path built from ``ann["sample_id"]``."""
        return os.path.join(self.audio_root, ann["sample_id"])

    def is_empty_audio(self, ann):
        """Return True when the audio for ``ann`` cannot be loaded or has no elements."""
        path = self.get_audio_path(ann)
        try:
            waveform, _ = torchaudio.load(path)
        except torchaudio.TorchaudioException:
            return True  # Audio loading failed

        # Averaging stereo channels cannot change emptiness, so it is not done.
        return waveform.nelement() == 0

    def get_existing_audio_annotations(self):
        """Return the entries found in ``audio_root``."""
        return os.listdir(self.audio_root)

    def get_existing_video_annotations(self):
        """Return the entries found in ``video_root``."""
        return os.listdir(self.video_root)

    def get_existing_images_annotations(self):
        """Return the entries found in ``vis_root``."""
        return os.listdir(self.vis_root)

    # NOTE(review): the two methods below read self.sample_id_key, which this
    # class never sets - confirm before enabling video/images modalities.
    def get_video_path(self, ann):
        """Return the resolved path of the video belonging to ``ann``."""
        return pathlib.Path(os.path.join(self.video_root, ann[self.sample_id_key])).resolve()

    def get_images_path(self, ann):
        """Return the resolved path of the image entry belonging to ``ann``."""
        return pathlib.Path(os.path.join(self.vis_root, ann[self.sample_id_key])).resolve()

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        """Return the processed sample at ``index``, or None when its audio sums to zero."""
        ann = copy.deepcopy(self.annotation[index])
        ann["sample_id"] = ann["filename"]
        ann['label'] = ann['category'].replace('_', ' ')
        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            # Always run the processor, including after choosing among several
            # paths; skipping it left the modality key unset.
            is_image = 'image' in modality
            processor = getattr(self, f"{'vis' if is_image else modality}_processor")
            ann['image' if is_image else modality] = processor(path)

        if ann["audio"].sum() == 0:
            return None

        return ann
# File: lavis/datasets/datasets/audio_qa_datasets.py
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import copy
import os
from lavis.datasets.datasets.audio_captioning_datasets import AudioCapsDataset
from lavis.datasets.datasets.base_dataset import BaseDataset
import torch
import random
from collections import Counter


class AudioCapsQADataset(AudioCapsDataset):
    """AudioCaps question answering, optionally mixed with generated yes/no questions."""

    def __init__(self, **kwargs):
        """Initialise like ``AudioCapsDataset``; ``add_binary`` enables yes/no questions."""
        super().__init__(**kwargs)
        self.add_binary = kwargs.get('add_binary', False)
        self.binary_templates = ["do you hear {}?", "is this {}?", "does the audio contain {}?"]

    def _binary_question(self, ann, index):
        """Return a templated yes/no ``(question, answer)`` pair for ``ann``.

        A "no" question uses the caption of a different annotation chosen
        uniformly at random; with fewer than two annotations only "yes"
        questions can be built.
        """
        total = len(self.annotation)
        yes_answer = random.randint(0, 10) < 5 or total < 2
        if yes_answer:
            caption = ann['caption']
        else:
            # Uniform draw over every index except `index`, without
            # materialising the whole index set for each sample.
            other = random.randrange(total - 1)
            if other >= index:
                other += 1
            caption = self.annotation[other]['caption']
        question = random.choice(self.binary_templates).format(caption)
        return question, 'yes' if yes_answer else 'no'

    def __getitem__(self, index):
        """Return the QA sample at ``index``, or None when its audio sums to zero."""
        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            if modality == 'audio' and self.cached:
                ann[f"{modality}_path"] = getattr(self, f"get_cached_{modality}_path")(ann)
                ann["audio"] = torch.load(ann[f"{modality}_path"])
                continue
            path = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            is_image = 'image' in modality
            processor = getattr(self, f"{'vis' if is_image else modality}_processor")
            ann['image' if is_image else modality] = processor(path)

        if ann["audio"].sum() == 0:
            return None

        if self.add_binary and random.randint(0, 10) < 3:
            question, answer = self._binary_question(ann, index)
        else:
            question, answer = ann['question'], ann['answer']

        return {
            "text_input": self.text_processor(question),
            "instance_id": ann["instance_id"],
            "text_output": answer,
            "answer": answer,
            "caption": ann['caption'],
            "audio": ann['audio'],
            "audio_id": ann['youtube_id'],
            "question_id": ann['youtube_id'],
        }


class ClothoQADataset(BaseDataset):
    """Clotho-AQA: consecutive triples of rows are merged into one question with three answers."""

    def __init__(self, **kwargs):
        """Group annotations three by three and apply the optional filters.

        Optional ``kwargs``: ``non_bin`` (drop questions having a yes/no answer)
        and ``unanimous`` (keep only questions whose three answers agree).
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.non_binary_cls = kwargs.get('non_bin', False)
        self.unanimous = kwargs.get('unanimous', False)

        annotation = []
        # Stop before a trailing incomplete triple instead of raising IndexError.
        for start in range(0, len(self.annotation) - 2, 3):
            new_ann = self.annotation[start]
            new_ann['question'] = new_ann.pop('QuestionText')
            answers = [self.annotation[start + off]['answer'] for off in range(3)]
            new_ann['answer'] = answers
            if self.unanimous and Counter(answers)[answers[0]] != 3:
                continue
            if self.non_binary_cls and ('yes' in answers or 'no' in answers):
                continue
            new_ann["question_id"] = new_ann['instance_id']
            annotation.append(new_ann)

        self.modalities = kwargs['modalities']
        for modality in self.modalities:
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
        self.annotation = annotation

    def __getitem__(self, index):
        """Return the QA sample at ``index``, or None when its audio sums to zero."""
        ann = copy.deepcopy(self.annotation[index])
        audio_path = os.path.join(self.audio_root, ann["file_name"])
        ann['audio'] = self.audio_processor(audio_path)

        if ann["audio"].sum() == 0:
            return None

        question = self.text_processor(ann['question'])
        return {
            "text_input": question,
            "question": question,
            "instance_id": ann["instance_id"],
            "text_output": random.choice(ann['answer']),
            "answer": ann['answer'],
            "answers": ann['answer'],
            "audio": ann['audio'],
            "question_id": ann['instance_id'],
        }

    def _build_templates(self, template):
        """Templates are not used by this dataset."""
        return None
# File: lavis/datasets/datasets/avsd_dialogue_datasets.py
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import torch
import os
import copy
import random
from PIL import Image
from lavis.datasets.datasets.dialogue_datasets import (
    DialogueDataset,
    DialogueEvalDataset,
)


def _avsd_item(dataset, index):
    """Build one AVSD sample: video features plus the tokenised dialogue."""
    ann = dataset.annotation[index]

    vname = ann["image_id"]
    video = dataset.vis_processor(dataset.vis_root, vname)
    dialogue = dataset.text_processor(ann)

    # "image_id" is kept to stay compatible with the COCO evaluation format
    return {
        "video_fts": video["video_fts"],
        "video_token_type_ids": video["token_type_ids"],
        "input_ids": dialogue["input_ids"],
        "token_type_ids": dialogue["token_type_ids"],
        "labels": dialogue["labels"],
        "image_id": ann["image_id"],
        "instance_id": ann["instance_id"],
    }


def _avsd_collate(dataset, samples):
    """Pad a list of AVSD samples and prepend the video part to every text tensor."""
    text_proc, vis_proc = dataset.text_processor, dataset.vis_processor

    input_ids = text_proc.padding([s["input_ids"] for s in samples])
    # ignore token indice -1 by default
    labels = text_proc.padding([s["labels"] for s in samples], -1)
    video_fts = vis_proc.padding([s["video_fts"] for s in samples])

    token_type_ids = text_proc.padding([s["token_type_ids"] for s in samples])
    video_token_type_ids = text_proc.padding([s["video_token_type_ids"] for s in samples])
    token_type_ids = torch.cat([video_token_type_ids, token_type_ids], dim=1)

    attn_mask = text_proc.get_attention_mask(input_ids)
    video_mask = vis_proc.get_attention_mask(video_fts)
    attn_mask = torch.cat([video_mask, attn_mask], dim=1)

    # Video positions carry the ignore label (-1).
    video_labels = torch.full((video_fts.size(0), video_fts.size(1)), -1, dtype=torch.long)
    labels = torch.cat([video_labels, labels], dim=1)

    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "labels": labels,
        "video_fts": video_fts,
        "attn_mask": attn_mask,
    }


class AVSDDialDataset(DialogueDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        """Return the video features and tokenised dialogue for ``index``."""
        return _avsd_item(self, index)

    def collater(self, samples):
        """Batch samples; video token types, mask and ignore labels are concatenated before the text ones."""
        return _avsd_collate(self, samples)


class AVSDDialEvalDataset(DialogueEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        """Return the video features and tokenised dialogue for ``index``."""
        return _avsd_item(self, index)

    def collater(self, samples):
        """Batch samples; video token types, mask and ignore labels are concatenated before the text ones."""
        return _avsd_collate(self, samples)


class AVSDDialInstructEvalDataset(DialogueDataset):
    """AVSD in instruction format: question as input, answer as output, per-modality features."""

    def __init__(self, **kwargs):
        """Store per-modality roots/processors and keep annotations present for every modality.

        When the first annotation path contains ``test``, only annotations whose
        answer is ``__UNDISCLOSED__`` are kept.
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            # Image modalities use vis_root / vis_processor set by the parent.
            if 'image' not in modality:
                setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
                setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
        self.sample_ids = set.intersection(
            *[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]
        )
        self.annotation = [ann for ann in self.annotation if ann['image_id'] in self.sample_ids]
        if 'test' in kwargs['ann_paths'][0]:
            self.annotation = [ann for ann in self.annotation if ann['answer'] == '__UNDISCLOSED__']

    def get_existing_audio_annotations(self):
        """Return the file names in ``audio_root`` cut at their first dot."""
        return [f.split('.')[0] for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        """Return the file names in ``video_root`` cut at their first dot."""
        return [f.split('.')[0] for f in os.listdir(self.video_root)]

    def get_audio_path(self, sample_key):
        """Return ``<audio_root>/<sample_key>.mp4``."""
        return os.path.join(self.audio_root, sample_key) + '.mp4'

    def get_video_path(self, sample_key):
        """Return ``<video_root>/<sample_key>.mp4``."""
        return os.path.join(self.video_root, sample_key) + '.mp4'

    def __getitem__(self, index):
        """Return the instruction-style sample at ``index``."""
        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann['image_id'])
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            if 'image' in modality:
                # Use the path stored for THIS modality; the hard-coded
                # "images_path" key only existed for a modality named "images".
                ann['image'] = self.vis_processor(Image.open(path))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(path).to(torch.float32)

        ann["sample_id"] = ann["image_id"]
        ann['dialog'] = "".join(f"{t['question']} {t['answer']} " for t in ann['dialog'])
        ann['text_output'] = self.text_processor(ann['answer'])
        ann['text_input'] = self.text_processor(ann['question'])
        ann["question_id"] = index
        return ann

    def __len__(self):
        return len(self.annotation)
# File: lavis/datasets/datasets/base_dataset.py
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import json
import os
from typing import Iterable
import pandas as pd
import torch

from torch.utils.data import Dataset, ConcatDataset
from torch.utils.data.dataloader import default_collate


class BaseDataset(Dataset):
    """Dataset holding a flat list of annotation dicts loaded from CSV/TSV/JSONL/JSON files."""

    def __init__(
        self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=()
    ):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths (iterable of strings): annotation files; their records are
            concatenated in order and each receives an ``instance_id``
        """
        self.vis_root = vis_root
        self.annotation = []
        for ann_path in ann_paths:
            self.annotation.extend(self._load_annotation_file(ann_path))

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self._add_instance_ids()

    @staticmethod
    def _annotation_format(ann_path):
        """Return ``"csv"``, ``"tsv"``, ``"jsonl"`` or ``"json"`` for ``ann_path``.

        The last recognised extension of the file name decides, so a directory
        or file stem that merely contains "csv" no longer turns a JSON file
        into a CSV one. Names without any recognised extension fall back to
        the previous substring rule.
        """
        path = str(ann_path)
        extensions = os.path.basename(path).lower().split(".")[1:]
        for ext in reversed(extensions):
            if ext in ("csv", "tsv", "jsonl", "json"):
                return ext
        if "csv" in path or "tsv" in path:
            return "csv"
        if "jsonl" in path:
            return "jsonl"
        return "json"

    @classmethod
    def _load_annotation_file(cls, ann_path):
        """Return the list of annotation records stored in one file."""
        fmt = cls._annotation_format(ann_path)
        if fmt in ("csv", "tsv"):
            # A real .tsv file is tab separated; the default comma separator
            # would collapse every row into a single column.
            df = pd.read_csv(ann_path, sep="\t" if fmt == "tsv" else ",")
            return df.to_dict(orient="records")

        with open(ann_path, "r") as f:
            if fmt == "jsonl":
                # Blank lines (e.g. a trailing newline) are not records.
                return [json.loads(line) for line in f if line.strip()]
            loaded = json.load(f)

        if isinstance(loaded, list):
            return loaded
        if isinstance(loaded, dict):
            # A mapping becomes one record per key, the key stored as sample_id.
            return [
                {"sample_id": k, **v} if isinstance(v, dict) else {"sample_id": k, "data": v}
                for k, v in loaded.items()
            ]
        return []

    def __len__(self):
        return len(self.annotation)

    def collater(self, samples):
        """Batch samples into a dict: tensors are stacked, other values are listed.

        ``None`` samples are dropped; an empty dict is returned when nothing is
        left. The keys of the first remaining sample define the batch keys.
        """
        samples = [s for s in samples if s is not None]
        if not samples:
            return {}
        collated_dict = {}
        for k in samples[0].keys():
            values = [sample[k] for sample in samples]
            collated_dict[k] = torch.stack(values, dim=0) if isinstance(values[0], torch.Tensor) else values
        return collated_dict

    def set_processors(self, vis_processor, text_processor):
        """Replace the visual and text processors."""
        self.vis_processor = vis_processor
        self.text_processor = text_processor

    def _add_instance_ids(self, key="instance_id"):
        """Store the position of every annotation, as a string, under ``key``."""
        for idx, ann in enumerate(self.annotation):
            ann[key] = str(idx)


class ConcatDataset(ConcatDataset):
    """``torch`` ConcatDataset with a collater restricted to keys shared by all samples."""

    def __init__(self, datasets: Iterable[Dataset]) -> None:
        super().__init__(datasets)

    def collater(self, samples):
        """Collate with the first dataset's collater, using only keys present in every sample.

        ``None`` samples (returned by datasets for unusable items) are dropped
        first; previously they raised a ``TypeError`` here.
        """
        # TODO For now only supports datasets with same underlying collater implementations
        samples = [s for s in samples if s is not None]
        if not samples:
            return self.datasets[0].collater([])

        shared_keys = set(samples[0].keys())
        for s in samples[1:]:
            shared_keys.intersection_update(s.keys())

        samples_shared_keys = [{k: s[k] for k in s.keys() if k in shared_keys} for s in samples]

        return self.datasets[0].collater(samples_shared_keys)
coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.img_ids = {} + n = 0 + for ann in self.annotation: + ann["image_id"] = ''.join(ann['image'].split('.')[:-1]) + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(ann["image"]) + try: + image = Image.open(image_path).convert("RGB") + except: + return None # image does not exist + + image = self.vis_processor(image) + caption = self.text_processor(ann["caption"]) + + return { + "image": image, + "text_input": caption, + "image_id": ann["image_id"] + } + +class CapFiltCaptionInstructDataset(CapFiltCaptionDataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = data["text_input"] + data['text_input'] = self.text_processor("") + return data \ No newline at end of file diff --git a/lavis/datasets/datasets/caption_datasets.py b/lavis/datasets/datasets/caption_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..973a0d682b2250cce2f3fbb10c322455d43611b3 --- /dev/null +++ b/lavis/datasets/datasets/caption_datasets.py @@ -0,0 +1,94 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class CaptionDataset(BaseDataset, __DisplMixin):
    """Image-caption training dataset backed by annotation records."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        # Map every distinct image id to a running integer index.
        self.img_ids = {}
        n = 0
        for ann in self.annotation:
            img_id = ann["image_id"]
            if img_id not in self.img_ids:
                self.img_ids[img_id] = n
                n += 1

    def __getitem__(self, index):
        """Return the processed image/caption pair, or ``None`` if the image cannot be loaded."""

        # TODO this assumes image input, not general enough
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception:
            # Best-effort loading: a missing or unreadable image yields None
            # so the collater can drop the sample. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            return None

        image = self.vis_processor(image)
        caption = self.text_processor(ann["caption"])

        return {
            "image": image,
            "text_input": caption,
            "image_id": ann["image_id"]
        }
class CaptionInstructDataset(CaptionDataset):
    """Instruction-style variant of ``CaptionDataset``.

    The caption becomes the generation target (``text_output``) and the
    input text is replaced by the text processor applied to an empty string.
    """

    def __getitem__(self, index):
        """Return the parent sample with caption moved to ``text_output``; ``None`` passes through."""
        data = super().__getitem__(index)
        # Identity check: the parent signals a failed image load with None.
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
def __getitem__(self, index):
    """Return the processed image with the id parsed from its file name.

    The image id is the last ``_``-separated token of the file name with a
    trailing ``.jpg`` removed.
    """
    ann = self.annotation[index]

    image_path = os.path.join(self.vis_root, ann["image"])
    image = Image.open(image_path).convert("RGB")

    image = self.vis_processor(image)

    file_name = ann["image"].split("/")[-1]
    # str.strip(".jpg") removes any of the characters '.', 'j', 'p', 'g'
    # from BOTH ends of the name rather than the ".jpg" suffix, which
    # corrupts names such as "dog.jpg". Remove only the literal suffix.
    if file_name.endswith(".jpg"):
        file_name = file_name[: -len(".jpg")]
    img_id = file_name.split("_")[-1]

    return {
        "image": image,
        "image_id": img_id,
        "instance_id": ann["instance_id"],
    }
class COCOVQADataset(VQADataset, __DisplMixin):
    """VQA training dataset that returns each distinct answer with its relative frequency."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        """Load one sample: processed image, processed question, answers and weights."""
        record = self.annotation[index]

        picture = Image.open(os.path.join(self.vis_root, record["image"])).convert("RGB")
        picture = self.vis_processor(picture)
        prompt = self.text_processor(record["question"])

        # Each occurrence of an answer contributes 1 / (number of answers),
        # so repeated answers accumulate a proportionally larger weight.
        tally = {}
        for candidate in record["answer"]:
            share = 1 / len(record["answer"])
            if candidate in tally:
                tally[candidate] += share
            else:
                tally[candidate] = share

        return {
            "image": picture,
            "text_input": prompt,
            "answers": list(tally),
            "weights": list(tally.values()),
        }
class MultiIterLoader:
    """
    Draw items from several iterator-style loaders, choosing one loader at random per step.

    Args:
        loaders (List[Loader]): Loaders to sample from; each must provide ``__next__``.
        ratios (List[float]): Relative sampling weight of each loader. When given,
            the values are normalized to sum to one. If None, every loader gets
            weight 1.0, i.e. loaders are sampled uniformly.
    """

    def __init__(self, loaders, ratios=None):
        # Every loader must already be an iterator, not just an iterable.
        for candidate in loaders:
            assert hasattr(
                candidate, "__next__"
            ), "Loader {} has no __next__ method.".format(candidate)

        if ratios is not None:
            assert len(ratios) == len(loaders)
            total = sum(ratios)
            weights = [float(value) / total for value in ratios]
        else:
            weights = [1.0] * len(loaders)

        self.loaders = loaders
        self.ratios = weights

    def __next__(self):
        # Pick a loader index according to the stored weights, then advance it.
        positions = range(len(self.loaders))
        (chosen,) = random.choices(positions, self.ratios, k=1)
        return next(self.loaders[chosen])
def next(self, it):
    """Return the batch that was preloaded and start preloading the following one.

    Waits for the side stream to finish, records the current stream on the
    returned batch, then calls ``preload`` on ``it``. Returns ``None`` once
    the underlying iterator is exhausted.
    """
    torch.cuda.current_stream().wait_stream(self.stream)
    batch = self.batch
    # The previous check `batch is not {}` compared identity against a
    # freshly created dict and was therefore always true. Test for an
    # empty dict explicitly instead.
    is_empty_dict = isinstance(batch, dict) and not batch
    if batch is not None and not is_empty_dict:
        record_cuda_stream(batch)
    self.preload(it)
    return batch
class DialogueDataset(BaseDataset, __DisplMixin):
    """Dialogue dataset that expands every dialog into one annotation per turn."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """

        self.vis_root = vis_root

        self.annotation = []
        for ann_path in ann_paths:
            # Use a context manager so the annotation file is closed promptly.
            with open(ann_path, "r") as f:
                dialogs = json.load(f)["dialogs"]
            for dialog in dialogs:
                all_turns = dialog["dialog"]
                dialogue_context = []
                for turn in all_turns:
                    # One instance per turn: "dialog" holds the turns seen so
                    # far, "question"/"answer" hold the current turn.
                    dialog_instance = copy.deepcopy(dialog)
                    question = turn["question"]
                    answer = turn["answer"]

                    dialog_instance["dialog"] = copy.deepcopy(dialogue_context)
                    dialog_instance["question"] = question
                    dialog_instance["answer"] = answer
                    self.annotation.append(dialog_instance)
                    dialogue_context.append(turn)

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self._add_instance_ids()

        # Map every distinct image id to a running integer index.
        self.img_ids = {}
        n = 0
        for ann in self.annotation:
            img_id = ann["image_id"]
            if img_id not in self.img_ids:
                self.img_ids[img_id] = n
                n += 1

    def __getitem__(self, index):
        """Return the processed image, processed caption and the integer image index."""

        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)
        caption = self.text_processor(ann["caption"])

        return {
            "image": image,
            "text_input": caption,
            "image_id": self.img_ids[ann["image_id"]],
        }
def __getitem__(self, index):
    """Return the processed image together with its image id and instance id."""
    record = self.annotation[index]

    location = os.path.join(self.vis_root, record["image"])
    picture = self.vis_processor(Image.open(location).convert("RGB"))

    item = {"image": picture}
    item["image_id"] = record["image_id"]
    item["instance_id"] = record["instance_id"]
    return item
class __DisplMixin:
    """Mixin that builds a displayable view of one dataset item."""

    def displ_item(self, index):
        """Return an OrderedDict of the serializable fields of the sample and its annotation.

        Sample fields come first; annotation fields follow and overwrite the
        value of any key the sample already provided.
        """
        sample, ann = self.__getitem__(index), self.annotation[index]

        merged = OrderedDict()
        for source in (sample, ann):
            for key, value in source.items():
                # Keep only values that is_serializable accepts.
                if is_serializable(value):
                    merged[key] = value
        return merged
def get_audio_path(self, ann, entity_index):
    """Build the .flac path of one entity for the 'audiocaps' dataset.

    The file name is ``<sample_id>_<start_second>.flac`` with the start
    second truncated to an int. Returns ``None`` for any other dataset name.
    """
    if self.ds_name != 'audiocaps':
        return None
    sample_id = ann['sample_ids'][entity_index]
    start = int(ann['start_seconds'][entity_index])
    file_name = sample_id + '_{}.flac'.format(start)
    return str(os.path.join(self.audio_root, file_name))
def get_pc_path(self, ann, entity_index):
    """Build the .npz path of one entity for the 'objaverse' dataset.

    The path is ``<pc_root>/<sample_id>/<sample_id>_<npoints>.npz``.
    Returns ``None`` for any other dataset name.
    """
    if self.ds_name != 'objaverse':
        return None
    sample_id = ann['sample_ids'][entity_index]
    file_name = '{}_{}.npz'.format(ann['sample_ids'][entity_index], self.npoints)
    return os.path.join(self.pc_root, sample_id, file_name)
+ for modality in self.modalities: + ann[f'captions_pred_{modality}'] = [ann[f'captions_pred_{modality}'][1], ann[f'captions_pred_{modality}'][0]] + + ## baseline captions + ann["baseline_captions"] = [c for c in ann["captions"]] if self.ground_truth else [ann[f'captions_pred_{ann["modalities"][0]}'][0], ann[f'captions_pred_{ann["modalities"][1]}'][1]] + # ann["baseline_captions"] = [c.replace('..', '.') for c in ann["baseline_captions"]] + ann["baseline_captions"] = [c.strip() if c!=None else "" for c in ann["baseline_captions"]] + ## text input + ann["text_input"] = self.text_processor(f'{ann["question"].replace("which entity", "which of the two options").replace("which object", "which of the two options").replace("which image", "which of the two options").replace("which audio", "which of the two options").replace("audio", "object").replace("image", "object")}?'.replace('??', '?')) + # ann["text_input"] = self.text_processor(f'{ann["question"]}?'.replace('??', '?')) + ## answers + first_answers = [ann['modalities'][0], "the first option.", "the first", "left one", "(a) left", "(a) left one", "(a)", 'a.', 'A.', "a)", "(A)", 'Input A', 'Entity 1', 'Object 1','Entity A', 'Object A', 'left', 'first', '1st', 'input 1', '1','a', 'input a', "the first", "the left one"] + second_answers = [ann['modalities'][1], "the second option.", "the second.", "second option", "the second option", "second option.", "right one","(b) right", "(b) right one" , "(b)", "b)", 'Input B', 'right', 'second', '2nd', 'input 2', '2', 'b', 'input b', 'Object 2','Entity B', 'Object B', "the second", "the right one", "the second one"] + if ann["label"] == 0: + ann["answers"] = first_answers + else: + ann["answers"] = second_answers + if 'pc' in ann["answers"]: + ann["answers"].extend(['3d', '3d model', 'model', 'rendering', 'a 3d', 'a 3d model']) + if 'image' in ann["answers"]: + ann["answers"].extend(['photo', 'picture']) + if 'audio' in ann["answers"]: + ann["answers"].append('sound') + ## label 
class __DisplMixin:
    """Mixin that builds a displayable view of one VQA item."""

    def displ_item(self, index):
        """Return an OrderedDict with the file, question, question id, answers and image.

        ``ann["answer"]`` may be a single string or a sequence of strings. A
        string is shown unchanged; a sequence is joined with ``"; "``.
        """
        sample, ann = self.__getitem__(index), self.annotation[index]

        answer = ann["answer"]
        # Joining a plain string would insert "; " between its characters
        # ("yes" -> "y; e; s"), so only join real sequences of answers.
        if not isinstance(answer, str):
            answer = "; ".join(answer)

        return OrderedDict(
            {
                "file": ann["image"],
                "question": ann["question"],
                "question_id": ann["question_id"],
                "answers": answer,
                "image": sample["image"],
            }
        )
class GQAEvalDataset(VQAEvalDataset, __DisplMixin):
    """GQA evaluation dataset with an optional answer list."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. gqa/images/)
        ann_root (string): directory to store the annotation file
        """

        self.vis_root = vis_root

        # Context managers close the files right after parsing.
        with open(ann_paths[0]) as f:
            self.annotation = json.load(f)

        ## TODO: support inference method == 'ranking'
        # The answer list is optional: it is loaded only when a second path
        # is given and that path exists.
        answer_list_path = ann_paths[1] if len(ann_paths) > 1 else ''
        if os.path.exists(answer_list_path):
            with open(answer_list_path) as f:
                self.answer_list = json.load(f)
        else:
            self.answer_list = None

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self._add_instance_ids()

    def __getitem__(self, index):
        """Return the processed image and question plus the answer (``None`` when absent) and ids."""
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)
        question = self.text_processor(ann["question"])

        if "answer" in ann:
            # answer is a string
            answer = ann["answer"]
        else:
            answer = None

        return {
            "image": image,
            "text_input": question,
            "answer": answer,
            "question_id": ann["question_id"],
            "instance_id": ann["instance_id"],
        }
class __DisplMixin:
    """Mixin that builds a displayable view of one multiple-choice VQA item."""

    def displ_item(self, index):
        """Return an OrderedDict describing the item at ``index``.

        List-valued fields (``direct_answers``, ``choices``) are joined with
        ``"; "``; the correct choice is looked up through
        ``correct_choice_idx``.
        """
        sample, ann = self.__getitem__(index), self.annotation[index]

        fields = OrderedDict()
        fields["file"] = ann["image"]
        fields["question"] = ann["question"]
        fields["question_id"] = ann["question_id"]
        fields["direct_answers"] = "; ".join(ann["direct_answers"])
        fields["choices"] = "; ".join(ann["choices"])
        fields["correct_choice"] = ann["choices"][ann["correct_choice_idx"]]
        fields["image"] = sample["image"]
        return fields
class IconQAInstructDataset(IconQADataset):
    """Instruction-style variant of ``IconQADataset`` that adds a ``text_output`` target."""

    def __getitem__(self, index):
        """Return the parent sample with one of its direct answers chosen as ``text_output``."""
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = random.choice(data["direct_answers"])
        return data

    def collater(self, samples):
        """Collate with the parent collater and expose its ``answer`` entry as ``text_output``."""
        # Fixed typo: the inherited method is `collater`; `collatter` does
        # not exist and raised AttributeError on every batch.
        data = super().collater(samples)
        data['text_output'] = data['answer']
        return data
coco/images/) + ann_root (string): directory to store the annotation file + """ + + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.vis_processor = vis_processor + self.text_processor = text_processor + + def collater(self, samples): + ( + image_list, + question_list, + question_id_list, + instance_id_list, + choices_list, + correct_choice_idx_list, + direct_answers_list, + ) = ([], [], [], [], [], [], []) + + for sample in samples: + image_list.append(sample["image"]) + question_list.append(sample["text_input"]) + question_id_list.append(sample["question_id"]) + instance_id_list.append(sample["instance_id"]) + choices_list.append(sample["choices"]) + correct_choice_idx_list.append(sample["correct_choice_idx"]) + direct_answers_list.append(sample["direct_answers"]) + + return { + "image": torch.stack(image_list, dim=0), + "text_input": question_list, + "instance_id": instance_id_list, + "choices": choices_list, + "correct_choice_idx": correct_choice_idx_list, + "direct_answers": direct_answers_list, + } + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = pathlib.Path(os.path.join(self.vis_root, ann["image"])).resolve() + + answers = [ann['choices'][ann['answer']]] + + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + choices = ann["choices"] + correct_choice_idx = ann["answer"] + + return { + "image": image, + "text_input": question, + "instance_id": ann["instance_id"], + "choices": choices, + "correct_choice_idx": correct_choice_idx, + "direct_answers": answers, + "question_id": ann["instance_id"] + } diff --git a/lavis/datasets/datasets/image_text_pair_datasets.py b/lavis/datasets/datasets/image_text_pair_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..3f019ef92fe1355451406d8e73d36452a048b27a --- /dev/null +++ b/lavis/datasets/datasets/image_text_pair_datasets.py @@ -0,0 +1,58 @@ +""" 
class __DisplMixin:
    """Mixin that renders one image-caption item for display."""

    def displ_item(self, index):
        """Return file name, caption and processed image for ``index``."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": os.path.basename(ann["image"]),
                "caption": ann["caption"],
                "image": sample["image"],
            }
        )


class ImageTextPairDataset(BaseDataset, __DisplMixin):
    """Image/caption pairs read from the annotation list."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths: annotation source(s), forwarded unchanged to ``BaseDataset``.
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        """Return ``{"image", "text_input"}`` or ``None`` if the image cannot be read."""

        # TODO this assumes image input, not general enough
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception:
            # Best effort: an unreadable image yields None, as before. Unlike
            # the original bare ``except:``, Ctrl-C and SystemExit propagate.
            return None

        image = self.vis_processor(image)
        caption = self.text_processor(ann["caption"])

        return {"image": image, "text_input": caption}


class ImageTextPairInstructDataset(ImageTextPairDataset):
    """Instruction-tuning variant: the caption becomes the target text."""

    def __getitem__(self, index):
        """Move the caption to ``text_output`` and use the processed empty string as input."""
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
class ImageFolderDataset(BaseDataset):
    """Classification dataset backed by ``torchvision.datasets.ImageFolder``."""

    def __init__(self, vis_processor, vis_root, classnames=None, **kwargs):
        """
        vis_processor: callable applied to every loaded image.
        vis_root: directory handed to ``ImageFolder``.
        classnames: optional sequence indexed by label; defaults to an empty
            list (``None`` replaces the original shared mutable ``[]`` default).
        **kwargs: accepted and ignored.
        """
        super().__init__(vis_processor=vis_processor, vis_root=vis_root)

        self.inner_dataset = datasets.ImageFolder(vis_root)

        self.annotation = [
            {"image": path, "label": label, "image_id": path}
            for path, label in self.inner_dataset.imgs
        ]

        self.classnames = [] if classnames is None else classnames

        self._add_instance_ids()

    def __len__(self):
        """Number of images found by the inner ``ImageFolder``."""
        return len(self.inner_dataset)

    def __getitem__(self, index):
        """Return the processed image with its label and identifiers."""
        ann = self.annotation[index]

        # ImageFolder paths already contain the root directory. Joining them
        # with ``vis_root`` again (as the original did) duplicates the prefix
        # whenever ``vis_root`` is relative, so open the stored path directly.
        image = Image.open(ann["image"]).convert("RGB")

        image = self.vis_processor(image)

        return {
            "image": image,
            "label": ann["label"],
            "image_id": ann["image_id"],
            "instance_id": ann["instance_id"],
        }

    def displ_item(self, index):
        """Return file path, class name and processed image for ``index``."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": ann["image"],
                "label": self.classnames[ann["label"]],
                "image": sample["image"],
            }
        )
# ---- lavis/datasets/datasets/laion_dataset.py ----

class LaionDataset(BaseDataset):
    """Streaming LAION image-text dataset built on a webdataset pipeline."""

    def __init__(self, vis_processor, text_processor, location):
        """
        vis_processor: callable applied to each decoded image.
        text_processor: callable applied to each caption.
        location: shard specification passed to ``wds.ResampledShards``.
        """
        super().__init__(vis_processor=vis_processor, text_processor=text_processor)

        self.inner_dataset = wds.DataPipeline(
            wds.ResampledShards(location),
            wds.tarfile_to_samples(handler=wds.warn_and_continue),
            wds.shuffle(1000, handler=wds.warn_and_continue),
            wds.decode("pilrgb", handler=wds.warn_and_continue),
            wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
            wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
            wds.map(self.to_dict, handler=wds.warn_and_continue),
        )

    def to_dict(self, sample):
        """Convert an ``(image, metadata)`` tuple into a model-ready dict.

        When the metadata is a list, one of its first two entries is picked at
        random as the caption; otherwise its ``"caption"`` field is used.
        """
        if isinstance(sample[1], list):
            caption = random.choice(sample[1][:2])
        else:
            caption = sample[1]["caption"]

        return {
            "image": sample[0],
            "text_input": self.text_processor(caption),
        }


class LaionInstructDataset(LaionDataset):
    """Instruction-tuning variant: the caption becomes the target text."""

    def to_dict(self, sample):
        """Move the caption to ``text_output`` and use the processed empty string as input."""
        data = super().to_dict(sample)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


if __name__ == "__main__":
    # Manual smoke test: stream two samples and print their captions.
    from torchvision import transforms

    def to_image_text_pair(sample):
        return sample[0], sample[1]["caption"]

    normalize = transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
    )

    transform_train = transforms.Compose(
        [
            transforms.RandomResizedCrop(256, scale=(0.2, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )

    dataset = LaionDataset(
        vis_processor=transform_train,
        text_processor=lambda x: x,
        # location="/export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{"
        # "00..15}_shard{000000..000118}.tar",
        location="/export/laion/laion2B-multi/part-00000/{00000..01743}.tar",
    )

    import torch

    loader = torch.utils.data.DataLoader(dataset.inner_dataset, batch_size=2)

    print(next(iter(loader))["text_input"])


# ---- lavis/datasets/datasets/llava150k_dataset.py ----

class LLaVA150kInstructDataset(BaseDataset):
    """LLaVA-Instruct-150K conversations: first turn as input, second as output."""

    def __init__(self, vis_processor, text_processor, ann_paths, vis_root):
        """
        vis_processor: callable applied to every loaded image.
        text_processor: callable applied to both prompt and response.
        ann_paths: annotation source(s), forwarded to ``BaseDataset``.
        vis_root: directory that image file names are resolved against.
        """
        super().__init__(vis_processor=vis_processor, text_processor=text_processor, ann_paths=ann_paths, vis_root=vis_root)
        self.inner_dataset = self.annotation
        self.location = vis_root

    def __len__(self):
        """Number of loaded conversation records."""
        return len(self.inner_dataset)

    def __getitem__(self, index):
        """Return image, prompt and response for the record at ``index``."""
        example = self.inner_dataset[index]
        # The original line read ``.replace('', '')``, which is a no-op. The
        # search token is restored to the '<image>' placeholder.
        # NOTE(review): '<image>' is assumed to be the lost token -- confirm
        # against the annotation files.
        text_input = example['conversations'][0]['value'].replace('<image>', '').strip()
        text_output = example['conversations'][1]['value']
        image_id = example['image']
        image_path = os.path.join(self.location, image_id)
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        return {
            "image": image,
            "instance_id": image_id,
            "text_input": self.text_processor(text_input),
            "text_output": self.text_processor(text_output),
            "image_path": image_path
        }
class MultimodalClassificationDataset(BaseDataset):
    """Base class for classification datasets that define a label mapping."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """Initialise the base dataset and leave ``class_labels`` unset (``None``)."""
        super().__init__(
            vis_processor=vis_processor,
            text_processor=text_processor,
            vis_root=vis_root,
            ann_paths=ann_paths,
        )
        # Subclasses populate this, typically from ``_build_class_labels``.
        self.class_labels = None

    @abstractmethod
    def _build_class_labels(self):
        """Hook for subclasses to build their label mapping; returns ``None`` here."""
        return None
class MusicAVQADataset(BaseDataset):
    """MUSIC-AVQA question answering over audio and/or video clips."""

    def __init__(self, **kwargs):
        """
        Required keyword arguments: ``vis_processor``, ``text_processor``,
        ``vis_root``, ``ann_paths`` and ``modalities``. For every modality
        whose name does not contain ``'image'``, ``<modality>_root`` and
        ``<modality>_processor`` are also required.

        Annotations are filtered to the ``video_id`` values present for every
        requested modality.
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            if 'image' not in modality:
                # The root must be set before the directory listing below.
                setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
                setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            # NOTE(review): no get_existing_image*_annotations is defined on
            # this class, so an image modality fails here -- confirm intent.
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
        self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities])
        self.annotation = [ann for ann in self.annotation if ann['video_id'] in self.sample_ids]

    def get_existing_audio_annotations(self):
        """File-name stems (text before the first '.') found in ``audio_root``."""
        return [f.split('.')[0] for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        """File-name stems (text before the first '.') found in ``video_root``."""
        return [f.split('.')[0] for f in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        """Path of the ``.mp4`` file under ``audio_root`` for this annotation."""
        # return os.path.join(self.audio_root, f'{ann["video_id"]}.flac')
        return os.path.join(self.audio_root, f'{ann["video_id"]}.mp4')

    def get_video_path(self, ann):
        """Path of the ``.mp4`` file under ``video_root`` for this annotation."""
        return os.path.join(self.video_root, f'{ann["video_id"]}.mp4')

    def __getitem__(self, index):
        """Return a copy of the annotation enriched with inputs and question text."""
        import re  # function-scope import keeps this fix self-contained

        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            if 'image' in modality:
                # Read the path just stored for this modality (the original
                # hard-coded the "images_path" key).
                ann['image'] = self.vis_processor(Image.open(path))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(path).to(torch.float32)

        ann["sample_id"] = ann["video_id"]
        # The original called ``.replace('', '{}')``, which inserts '{}'
        # between every character and breaks ``.format``. Each angle-bracket
        # placeholder is turned into one format slot instead.
        # NOTE(review): assumes placeholders look like <Name> and that
        # 'templ_values' lists their values in order -- confirm.
        template = re.sub(r"<[^<>]+>", "{}", ann['question_content'])
        question = template.format(*ast.literal_eval(ann['templ_values']))
        ann['text_input'] = self.text_processor(question)
        ann["question_id"] = ann['question_id']  # no-op except for the KeyError if absent
        # 'anser' is the key as read from the annotation; presumably the
        # dataset's own spelling -- verify before renaming.
        ann['answers'] = ann['anser']
        return ann


class MusicAVQAInstructDataset(MusicAVQADataset):
    """Instruction-tuning variant that adds ``answer`` and ``text_output``."""

    def __getitem__(self, index):
        """Copy ``answers`` into ``answer`` and ``text_output``."""
        data = super().__getitem__(index)
        if data is not None:
            data['answer'] = data["answers"]  # needed to use gqa task
            data['text_output'] = data["answers"]
        return data
class __DisplMixin:
    """Mixin that renders one NLVR image pair for display."""

    def displ_item(self, index):
        """Return both file names, the sentence, label and processed images."""
        sample, ann = self.__getitem__(index), self.annotation[index]
        left_file, right_file = ann["images"][0], ann["images"][1]

        summary = OrderedDict()
        summary["file_L"] = left_file
        summary["file_R"] = right_file
        summary["sentence"] = ann["sentence"]
        summary["label"] = ann["label"]
        summary["image"] = [sample["image0"], sample["image1"]]
        return summary


class NLVRDataset(MultimodalClassificationDataset, __DisplMixin):
    """NLVR training split: two images plus a sentence with a True/False label."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """Initialise the base dataset and build the label mapping."""
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.class_labels = self._build_class_labels()

    def _build_class_labels(self):
        """Map the annotation label strings to integer class ids."""
        return {"False": 0, "True": 1}

    @staticmethod
    def _flip(samples):
        """Swap the two images with probability 0.5.

        When the sentence mentions "left" or "right", those two words are
        exchanged together with the images.
        """
        text = samples["text_input"]
        first, second = samples["image0"], samples["image1"]

        # One random draw per call, exactly as before.
        if random.random() < 0.5:
            if "left" in text or "right" in text:
                text = (
                    text.replace("left", "[TEMP_TOKEN]")
                    .replace("right", "left")
                    .replace("[TEMP_TOKEN]", "right")
                )
            first, second = second, first

        samples["text_input"] = text
        samples["image0"] = first
        samples["image1"] = second

        return samples

    def _load_image(self, relative_path):
        """Open, convert to RGB and process one image under ``vis_root``."""
        full_path = os.path.join(self.vis_root, relative_path)
        return self.vis_processor(Image.open(full_path).convert("RGB"))

    def __getitem__(self, index):
        """Return the (possibly flipped) image pair, sentence and label."""
        ann = self.annotation[index]

        image0 = self._load_image(ann["images"][0])
        image1 = self._load_image(ann["images"][1])

        sentence = self.text_processor(ann["sentence"])
        label = self.class_labels[ann["label"]]

        item = {
            "image0": image0,
            "image1": image1,
            "text_input": sentence,
            "label": label,
            # "image_id": ann["image_id"],
            "instance_id": ann["instance_id"],
        }
        return self._flip(item)


class NLVREvalDataset(NLVRDataset):
    """Evaluation split: samples are returned without flipping."""

    @staticmethod
    def _flip(samples):
        """Return ``samples`` unchanged."""
        return samples
class __DisplMixin:
    """Mixin that renders the serialisable fields of one item for display."""

    def displ_item(self, index):
        """Merge serialisable sample fields with serialisable annotation fields.

        Annotation values override sample values on key collisions.
        """
        sample, ann = self.__getitem__(index), self.annotation[index]
        display = {k: v for k, v in sample.items() if is_serializable(v)}
        display.update({k: v for k, v in ann.items() if is_serializable(v)})

        return OrderedDict(display)


class Object3dCaptionDataset(BaseDataset, __DisplMixin):
    """Base class for captioning datasets over 3D objects (points, images, depth)."""

    def __init__(self, **kwargs):
        """
        Required keyword arguments: ``vis_processor``, ``text_processor``,
        ``vis_root``, ``ann_paths`` and ``modalities``. For every modality
        whose name does not contain ``'image'``, ``<modality>_root`` and
        ``<modality>_processor`` are also required.

        Annotations are filtered to the ``sample_id`` values available for
        every requested modality.
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']
        self.npoints = 8192
        self.sample_points_num = self.npoints
        # Subclasses read ``self.templates`` in ``__getitem__``; default it so
        # that read cannot raise when ``_build_templates`` was never called.
        self.templates = None

        for modality in self.modalities:
            if 'image' not in modality:
                # The root must be set before the listing call below.
                setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
                setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
        self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities])
        self.annotation = [ann for ann in self.annotation if ann['sample_id'] in self.sample_ids]

    def get_existing_depth_annotations(self):
        """Entries found in ``depth_root``."""
        return os.listdir(self.depth_root)

    def get_existing_images_annotations(self):
        """Entries found in ``vis_root``."""
        return os.listdir(self.vis_root)

    def get_existing_pc_annotations(self):
        raise NotImplementedError("Subclasses should implement this!")

    def get_pc_path(self, sample_key):
        raise NotImplementedError("Subclasses should implement this!")

    def get_images_path(self, sample_key):
        raise NotImplementedError("Subclasses should implement this!")

    def get_depth_path(self, sample_key):
        raise NotImplementedError("Subclasses should implement this!")

    def __getitem__(self, index):
        """Return a copy of the annotation with ``captions`` and modality inputs."""
        ann = copy.deepcopy(self.annotation[index])
        ann['captions'] = ann.pop('data')

        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann['sample_id'])
            if isinstance(path, list):  # select from image views
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            if 'image' in modality:
                ann['image'] = self.vis_processor(Image.open(path))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(path).to(torch.float32)
        return ann

    def __len__(self):
        """Number of annotations left after modality filtering."""
        return len(self.annotation)

    def _build_templates(self, templates_path):
        """Load caption templates from a JSON file, or clear them when the path is ``None``."""
        # use captions not templates
        if templates_path is None:
            self.templates = None
        else:
            with open(templates_path) as f:
                self.templates = json.load(f)


class ObjaverseCaptionDataset(Object3dCaptionDataset, __DisplMixin):
    """Objaverse captions with point clouds and eight rendered views."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_existing_images_annotations(self):
        """Sample keys (text before the first '_') that have a view-0 render."""
        return [f.split('_')[0] for f in os.listdir(os.path.join(self.vis_root, f'compressed_imgs_view{0}/Cap3D_imgs_view{0}/'))]

    def get_existing_pc_annotations(self):
        """Sample ids that are both annotated and present in ``pc_root``."""
        return list(set(os.listdir(self.pc_root)).intersection(set(ann['sample_id'] for ann in self.annotation)))

    def get_pc_path(self, sample_key):
        """Path of the ``.npz`` point cloud for ``sample_key``."""
        return os.path.join(self.pc_root, sample_key, '{}_{}.npz'.format(sample_key, self.npoints))

    def get_images_path(self, sample_key):
        """Paths of the eight rendered views for ``sample_key``."""
        # data downloaded from: https://huggingface.co/datasets/tiange/Cap3D/tree/main/RenderedImage_zips
        return [os.path.join(self.vis_root, f'compressed_imgs_view{i}/Cap3D_imgs_view{i}/', sample_key+f'_{i}.jpeg') for i in range(8)]

    def __getitem__(self, index):
        """Add ``text_input``: one processed caption chosen at random."""
        ann = super().__getitem__(index)
        ann['text_input'] = self.text_processor(random.choice(ann['captions']))
        return ann


class ObjaverseCaptionInstructDataset(ObjaverseCaptionDataset):
    """Instruction-tuning variant: the caption becomes the target text."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class ObjaverseCaptionEvalDataset(ObjaverseCaptionDataset):
    """Evaluation variant: ``text_input`` is removed from each sample."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data


class ShapenetCaptionDataset(Object3dCaptionDataset, __DisplMixin):
    """ShapeNet captions; captions may be a list or a comma-separated object string."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_existing_pc_annotations(self):
        """Unique ``.npy`` stems found in ``pc_root``."""
        return list(set([f.replace('.npy', '') for f in os.listdir(self.pc_root)]))

    def get_pc_path(self, sample_key):
        """Path of the ``.npy`` point cloud for ``sample_key``."""
        return os.path.join(self.pc_root, sample_key+'.npy')

    def get_images_path(self, sample_key):
        """Paths of every file inside the sample's image directory."""
        return [os.path.join(self.vis_root, sample_key, img_path) for img_path in os.listdir(os.path.join(self.vis_root, sample_key))]

    def __getitem__(self, index):
        """Add ``text_input``; non-list captions are first expanded into a list."""
        ann = super().__getitem__(index)
        if not isinstance(ann['captions'], list):
            ann['objects'] = ann['captions']
            if self.templates:
                ann['captions'] = [random.choice(self.templates).format(obj) for obj in ann['objects'].split(',')]
            else:
                ann['captions'] = [random.choice(ann['objects'].split(','))]
        ann['text_input'] = self.text_processor(random.choice(ann['captions']))
        return ann


class ShapenetCaptionInstructDataset(ShapenetCaptionDataset):
    """Instruction-tuning variant: the caption becomes the target text."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class ShapenetCaptionEvalDataset(ShapenetCaptionDataset):
    """Evaluation variant: ``text_input`` is removed from each sample."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class __DisplMixin:
    """Mixin that renders one item (file, caption, image, point cloud) for display."""

    def displ_item(self, index):
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": ann["image"],
                "caption": ann["caption"],
                "image": sample["image"],
                "pc": sample["pc"],
            }
        )


class ModelNetClassificationDataset(BaseDataset, __DisplMixin):
    """
    Dataset for ModelNet Classification.
    """

    def __init__(self, **kwargs):
        """
        Required keyword arguments: ``vis_processor``, ``text_processor``,
        ``vis_root``, ``modalities``, ``ann_paths`` and ``pc_root``.

        ``ann_paths[0]`` is read as the class-name list, ``ann_paths[-1]`` as
        the shape-id list, and ``ann_paths[1]`` is the processed-data cache
        while ``self.uniform`` is true.
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], [])

        self.modalities = kwargs['modalities']
        # Setting dataset specific properties
        self.npoints = 8192
        self.use_normals = False
        self.num_category = 40
        self.process_data = True
        self.uniform = True
        self.generate_from_raw_data = False
        ann_paths = kwargs['ann_paths']

        assert 'pc_root' in kwargs, "Point cloud root needs to be provided to retrieve labels."
        self.pc_root = kwargs["pc_root"]

        # Fetching class names and IDs (files are closed deterministically;
        # the original left both handles to the garbage collector).
        with open(ann_paths[0]) as f:
            self.classnames = [line.rstrip() for line in f]
        self.classes = dict(zip(self.classnames, range(len(self.classnames))))
        with open(ann_paths[-1]) as f:
            self.shape_ids = [line.rstrip() for line in f]
        # Shape name = shape id without its last '_'-separated field.
        self.shape_names = ['_'.join(x.split('_')[0:-1]) for x in self.shape_ids]

        # Setting data paths
        self.datapath = [
            (name, os.path.join(self.pc_root, name, shape_id) + '.txt')
            for name, shape_id in zip(self.shape_names, self.shape_ids)
        ]

        # Saving path settings
        self.save_path = ann_paths[1] if self.uniform else ann_paths[0].replace('_fps', '')

        # Processing or loading data
        self._prepare_data()

    def _prepare_data(self):
        """Build the processed cache from raw data, or load it from ``save_path``.

        Raw processing runs only when the cache file is missing and
        ``generate_from_raw_data`` is set; otherwise the cache is loaded, and
        a missing file surfaces as the ``open`` error.
        """
        if not self.process_data:
            return

        if not os.path.exists(self.save_path) and self.generate_from_raw_data:
            print('Processing data %s (only running in the first time)...' % self.save_path)
            self._process_raw_data()
            return

        print('Load processed data from %s...' % self.save_path)
        with open(self.save_path, 'rb') as f:
            # SECURITY NOTE: pickle executes arbitrary code on load; only
            # point ``save_path`` at trusted files.
            self.list_of_points, self.list_of_labels = pickle.load(f)

    def _process_raw_data(self):
        """Read every raw ``.txt`` point set, subsample it and pickle the result."""
        self.list_of_points = [None] * len(self.datapath)
        self.list_of_labels = [None] * len(self.datapath)
        for index in tqdm(range(len(self.datapath)), total=len(self.datapath)):
            shape_name, txt_path = self.datapath[index]
            cls = np.array([self.classes[shape_name]]).astype(np.int32)
            point_set = np.loadtxt(txt_path, delimiter=',').astype(np.float32)

            if self.uniform:
                point_set = farthest_point_sample(point_set, self.npoints)
                print("uniformly sampled out {} points".format(self.npoints))
            else:
                point_set = point_set[0:self.npoints, :]

            self.list_of_points[index] = point_set
            self.list_of_labels[index] = cls

        with open(self.save_path, 'wb') as f:
            pickle.dump([self.list_of_points, self.list_of_labels], f)

    def __len__(self):
        """Number of shapes.

        Falls back to the raw path list when no processed cache is in use;
        the original raised AttributeError in that configuration.
        """
        if self.process_data:
            return len(self.list_of_labels)
        return len(self.datapath)

    def _get_item(self, index):
        """Return ``(points, label)`` for ``index`` with normalised xyz columns."""
        if self.process_data:
            point_set, label = self.list_of_points[index], self.list_of_labels[index]
        else:
            shape_name, txt_path = self.datapath[index]
            label = np.array([self.classes[shape_name]]).astype(np.int32)
            point_set = np.loadtxt(txt_path, delimiter=',').astype(np.float32)

            # Uniform sampling or trimming
            if self.uniform:
                point_set = farthest_point_sample(point_set, self.npoints)
            else:
                point_set = point_set[0:self.npoints, :]
        # Cached point sets may hold more points than requested.
        if self.npoints < point_set.shape[0]:
            point_set = farthest_point_sample(point_set, self.npoints)

        point_set[:, 0:3] = pc_normalize(point_set[:, 0:3])
        if not self.use_normals:
            point_set = point_set[:, 0:3]

        return point_set, label[0]

    def __getitem__(self, index):
        """Return ids, the class name and the requested modalities for ``index``."""
        points, label = self._get_item(index)
        label_name = self.classnames[int(label)]

        data = {
            "instance_id": index,
            "sample_key": index,
            "image_id": index,
            "label": label_name
        }

        if 'pc' in self.modalities:
            # Shuffle point order on every access.
            pt_idxs = np.arange(0, points.shape[0])
            np.random.shuffle(pt_idxs)
            current_points = points[pt_idxs].copy()
            current_points = torch.from_numpy(current_points).float()
            data['pc'] = current_points
        if any(k in self.modalities for k in ('images', 'image')):
            img = Image.open(os.path.join(self.vis_root, f"{index}.jpeg"))
            data['image'] = self.vis_processor(img)

        return data
class ObjaverseQADataset(Object3dCaptionDataset):
    """Question answering over Objaverse objects, with optional yes/no questions."""

    def __init__(self, **kwargs):
        """
        Accepts the keyword arguments of ``Object3dCaptionDataset`` plus:

        add_binary (bool, default False): sometimes replace the annotated
            question with a templated yes/no question about a caption.
        remove_model_answer (bool, default False): drop annotations whose
            answer contains the substring ``'model'``.
        """
        super().__init__(**kwargs)
        self.add_binary = kwargs.get('add_binary', False)
        self.binary_templates = ["do you see {}?", "is this {}?", "does the 3d model contain {}?"]
        self.remove_model_answer = kwargs.get('remove_model_answer', False)
        if self.remove_model_answer:
            self.annotation = [ann for ann in self.annotation if 'model' not in ann['answer']]

    def get_existing_pc_annotations(self):
        """Sample ids that are both annotated and present in ``pc_root``."""
        return list(set(os.listdir(self.pc_root)).intersection(set(ann['sample_id'] for ann in self.annotation)))

    def get_pc_path(self, sample_key):
        """Path of the ``.npz`` point cloud for ``sample_key``."""
        return os.path.join(self.pc_root, sample_key, '{}_{}.npz'.format(sample_key, self.npoints))

    def get_images_path(self, sample_key):
        """Paths of the eight rendered views for ``sample_key``."""
        # data downloaded from: https://huggingface.co/datasets/tiange/Cap3D/tree/main/RenderedImage_zips
        return [os.path.join(self.vis_root, f'compressed_imgs_view{i}/Cap3D_imgs_view{i}/', sample_key+f'_{i}.jpeg') for i in range(8)]

    def __getitem__(self, index):
        """Return a copy of the annotation with modality inputs and a QA pair."""
        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann['sample_id'])
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            if 'image' in modality:
                # The original read ann["image_path"], but the key written
                # above for the 'images' modality is "images_path", which
                # raised KeyError. Use the path stored for this modality.
                ann['image'] = self.vis_processor(Image.open(path))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(path).to(torch.float32)

        if self.add_binary and random.randint(0, 10) < 3:
            yes_answer = random.randint(0, 10) < 5
            if not yes_answer:
                # Negative example: borrow the caption of a different item.
                caption_index = random.choice(list(set(range(len(self.annotation))).difference(set([index]))))
                caption = self.annotation[caption_index]['caption']
            else:
                caption = ann['caption']

            question = random.choice(self.binary_templates).format(caption)
            answer = 'yes' if yes_answer else 'no'
            ann['text_input'] = self.text_processor(question)
            ann['text_output'] = answer

        else:
            ann['text_input'] = self.text_processor(ann['question'])
            ann['text_output'] = ann['answer']

        ann['answers'] = [ann['text_output']]
        ann['question_id'] = ann['instance_id']
        return ann
class OCRVQADataset(VQADataset):
    """OCR-VQA flattened to one entry per (image, question, answer) triple."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """Load annotations and expand each record into per-question entries.

        Records whose ``<sample_id>.jpg`` is missing under ``vis_root`` are
        skipped. ``instance_id`` is a running index over the kept entries.
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        count_id = 0
        annotations = []
        for ann in self.annotation:
            # One existence check per image (the original repeated it, after
            # a deep copy, for every question of the same image).
            image_path = os.path.join(self.vis_root, ann['sample_id'] + '.jpg')
            if not os.path.exists(image_path):
                continue
            for q, a in zip(ann['questions'], ann['answers']):
                new_ann = copy.deepcopy(ann)
                new_ann['questions'] = q
                new_ann['answers'] = a
                new_ann['instance_id'] = count_id
                count_id += 1
                annotations.append(new_ann)
        self.annotation = annotations

    def __getitem__(self, index):
        """Return one QA sample, or ``None`` if the image cannot be read."""
        ann = self.annotation[index]
        image_id = ann['sample_id'] + '.jpg'
        image_path = os.path.join(self.vis_root, image_id)
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception:
            # Best effort: an unreadable image yields None, as before. Unlike
            # the original bare ``except:``, Ctrl-C and SystemExit propagate.
            return None
        image = self.vis_processor(image)
        question = self.text_processor(ann["questions"])

        answers = [ann["answers"]]
        # TODO this should be configured better
        weights = [1.]

        return {
            "image": image,
            "text_input": question,
            "answers": answers,
            "weights": weights,
            # NOTE(review): sample_id is shared by every question of an
            # image, so this id is not unique per question -- confirm intent.
            "question_id": ann["sample_id"]
        }


class OCRVQAInstructDataset(OCRVQADataset):
    """Instruction-tuning variant that adds a ``text_output`` target."""

    def __getitem__(self, index):
        """Return the parent sample with one answer copied to ``text_output``."""
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = random.choice(data["answers"])
        return data

    def collater(self, samples):
        """Collate with the parent collater and expose answers as ``text_output``."""
        # NOTE(review): relies on the parent collater returning an 'answer'
        # key -- confirm against VQADataset.collater.
        data = super().collater(samples)
        data['text_output'] = data['answer']
        return data
class RetrievalEvalDataset(BaseDataset, __DisplMixin):
    """Image-text retrieval evaluation set with image<->caption index maps."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.text = []
        self.image = []
        self.txt2img = {}
        self.img2txt = {}

        for img_idx, record in enumerate(self.annotation):
            self.image.append(record["image"])
            caption_ids = []
            self.img2txt[img_idx] = caption_ids
            for raw_caption in record["caption"]:
                # A caption's id is its position in the flat ``self.text`` list.
                caption_id = len(self.text)
                self.text.append(self.text_processor(raw_caption))
                caption_ids.append(caption_id)
                self.txt2img[caption_id] = img_idx

    def __getitem__(self, index):
        """Load and preprocess the image at ``index``; captions live in ``self.text``."""
        record = self.annotation[index]
        full_path = os.path.join(self.vis_root, record["image"])
        pixels = self.vis_processor(Image.open(full_path).convert("RGB"))
        return {"image": pixels, "index": index}
class VideoRetrievalEvalDataset(BaseDataset, __DisplMixin):
    """Video-text retrieval evaluation set with video<->caption index maps."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of videos.
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.text = []
        self.image = []
        self.txt2img = {}
        self.img2txt = {}

        next_txt_id = 0
        for video_idx, entry in enumerate(self.annotation):
            self.image.append(entry["video"])
            self.img2txt[video_idx] = []
            for raw_caption in entry["caption"]:
                self.text.append(self.text_processor(raw_caption))
                self.img2txt[video_idx].append(next_txt_id)
                self.txt2img[next_txt_id] = video_idx
                next_txt_id = next_txt_id + 1

    def __getitem__(self, index):
        """Run the visual processor on the video path of annotation ``index``."""
        entry = self.annotation[index]
        frames = self.vis_processor(os.path.join(self.vis_root, entry["video"]))
        return {"video": frames, "index": index}
class SNLIVisualEntialmentDataset(MultimodalClassificationDataset, __DisplMixin):
    """SNLI-VE samples: an image, a sentence and a three-way entailment label."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        label_map = self._build_class_labels()
        self.class_labels = label_map
        # Class names in label-map insertion order.
        self.classnames = list(label_map)

    def _build_class_labels(self):
        """Return the mapping from label name to integer class id."""
        return dict(contradiction=0, neutral=1, entailment=2)

    def __getitem__(self, index):
        """Build the sample dict for annotation ``index``."""
        record = self.annotation[index]
        image_id = record["image"]

        # Images are stored as ``<image_id>.jpg`` under ``vis_root``.
        jpg_path = os.path.join(self.vis_root, "%s.jpg" % image_id)
        pixels = self.vis_processor(Image.open(jpg_path).convert("RGB"))
        text = self.text_processor(record["sentence"])
        label_id = self.class_labels[record["label"]]

        return {
            "image": pixels,
            "text_input": text,
            "label": label_id,
            "image_id": image_id,
            "instance_id": record["instance_id"],
        }
class SubjectDrivenTextToImageDataset(Dataset):
    """Repeats a small folder of subject images for subject-driven fine-tuning.

    Every image file found directly inside ``image_dir`` is paired with the
    same caption ``"a <subject>"``. The dataset reports a length of
    ``len(images) * repetition`` and cycles through the images by index.
    """

    # Accepted image suffixes (without the dot); matched case-insensitively.
    IMAGE_EXTENSIONS = ("jpg", "png", "webp", "jpeg")

    def __init__(
        self,
        image_dir,
        subject_text,
        inp_image_processor,
        tgt_image_processor,
        txt_processor,
        repetition=100000,
    ):
        """
        image_dir (string): directory that holds the subject images.
        subject_text (string): subject name; lower-cased before processing.
        inp_image_processor: callable applied to produce ``inp_image``.
        tgt_image_processor: callable applied to produce ``tgt_image``.
        txt_processor: callable applied to the subject text and the caption.
        repetition (int): how many times the image list is virtually repeated.
        """
        self.subject = txt_processor(subject_text.lower())
        self.image_dir = image_dir

        self.inp_image_transform = inp_image_processor
        self.tgt_image_transform = tgt_image_processor

        self.text_processor = txt_processor

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> image mapping reproducible. Lower-casing the suffix also
        # accepts mixed-case names such as "photo.Jpg".
        image_names = sorted(
            name
            for name in os.listdir(image_dir)
            if os.path.splitext(name)[1][1:].lower() in self.IMAGE_EXTENSIONS
        )
        # make absolute path
        self.image_paths = [
            os.path.abspath(os.path.join(image_dir, name)) for name in image_names
        ]
        self.repetition = repetition

    def __len__(self):
        return len(self.image_paths) * self.repetition

    @property
    def len_without_repeat(self):
        """Number of distinct image files."""
        return len(self.image_paths)

    def collater(self, samples):
        return default_collate(samples)

    def __getitem__(self, index):
        """Return the processed input/target images and caption for ``index``."""
        # Wrap around so every index below len(self) maps to a real image.
        image_path = self.image_paths[index % len(self.image_paths)]
        image = Image.open(image_path).convert("RGB")

        # For fine-tuning, we use the same caption for all images
        # maybe worth trying different captions for different images
        caption = f"a {self.subject}"
        caption = self.text_processor(caption)

        inp_image = self.inp_image_transform(image)
        tgt_image = self.tgt_image_transform(image)

        return {
            "inp_image": inp_image,
            "tgt_image": tgt_image,
            "caption": caption,
            "subject_text": self.subject,
        }
class TextCapsCapEvalDataset(CaptionEvalDataset):
    """TextCaps captioning evaluation split, restricted to captioned entries."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        # The base initialiser is called directly, bypassing CaptionEvalDataset.
        BaseDataset.__init__(self, vis_processor, text_processor, vis_root, ann_paths)

        records = self.annotation[3]['data']
        # only keep annotations with captions
        self.annotation = [rec for rec in records if "caption_str" in rec]

        self.img_ids = {}
        for rec in self.annotation:
            # First occurrence of an image id gets the next running index.
            self.img_ids.setdefault(rec["image_id"], len(self.img_ids))
            rec["image"] = rec["image_id"] + '.jpg'
            # Rename "caption_str" to "caption".
            rec["caption"] = rec.pop("caption_str")

        self._add_instance_ids()
class VALORCaptionDataset(BaseDataset):
    """VALOR captioning dataset over a configurable list of modalities.

    For each non-image modality the constructor reads ``<modality>_root`` and
    ``<modality>_processor`` from ``kwargs``. Annotations are kept only when
    their id is available for every requested modality, and are deduplicated
    by ``video_id``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            kwargs['vis_processor'],
            kwargs['text_processor'],
            kwargs['vis_root'],
            kwargs['ann_paths'],
        )

        self.modalities = kwargs['modalities']

        for name in self.modalities:
            if 'image' not in name:
                # Root and processor are configured per non-image modality.
                setattr(self, f"{name}_root", kwargs[f"{name}_root"])
                setattr(self, f"{name}_processor", kwargs[f"{name}_processor"])
            lister = getattr(self, f'get_existing_{name}_annotations')
            setattr(self, f"existing_{name}_annotation", lister())

        # Ids present for every requested modality.
        per_modality = (
            set(getattr(self, f"existing_{name}_annotation"))
            for name in self.modalities
        )
        self.sample_ids = set.intersection(*per_modality)

        # File names are matched against the annotation id with "000"
        # replaced by "0".
        available = [
            rec
            for rec in self.annotation
            if rec['video_id'].replace('000', '0') in self.sample_ids
        ]

        # Keep only the first annotation of each video id.
        deduped = []
        seen = set()
        for rec in available:
            vid = rec["video_id"]
            if vid in seen:
                continue
            seen.add(vid)
            deduped.append(rec)
        self.annotation = deduped

    def __len__(self):
        return len(self.annotation)

    def get_existing_audio_annotations(self):
        """File names in ``audio_root`` with the last dotted suffix removed."""
        return ['.'.join(fname.split('.')[:-1]) for fname in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        """File names in ``video_root`` with the last dotted suffix removed."""
        return ['.'.join(fname.split('.')[:-1]) for fname in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        file_id = ann["video_id"].replace("000", "0")
        return os.path.join(self.audio_root, f'{file_id}.mp4')

    def get_video_path(self, ann):
        file_id = ann["video_id"].replace("000", "0")
        return os.path.join(self.video_root, f'{file_id}.mp4')

    def __getitem__(self, index):
        """Return a copy of annotation ``index`` extended with processed modalities."""
        ann = copy.deepcopy(self.annotation[index])
        ann["sample_id"] = ann["video_id"]
        ann["text_input"] = self.text_processor(ann['desc'])

        for name in self.modalities:
            path_key = f"{name}_path"
            ann[path_key] = getattr(self, f"get_{name}_path")(ann)
            if type(ann[path_key]) is list:
                # Several candidate files: pick one at random.
                ann[path_key] = random.choice(ann[path_key])
            if 'image' in name:
                ann['image'] = self.vis_processor(Image.open(ann["images_path"]))
            else:
                processor = getattr(self, f"{name}_processor")
                ann[name] = processor(ann[path_key]).to(torch.float32)

        ann["caption"] = ann["text_input"]
        ann["image_id"] = ann["video_id"]

        return ann
class VATEXCaptionDataset(BaseDataset):
    """VATEX captioning dataset over a configurable list of modalities.

    For each non-image modality the constructor reads ``<modality>_root`` and
    ``<modality>_processor`` from ``kwargs``. Annotations are deduplicated by
    their ``video`` field. ``sample_ids`` is computed but is not used to
    filter annotations in this class.
    """

    def __init__(self, **kwargs):
        super().__init__(
            kwargs['vis_processor'],
            kwargs['text_processor'],
            kwargs['vis_root'],
            kwargs['ann_paths'],
        )

        self.modalities = kwargs['modalities']

        for name in self.modalities:
            if 'image' not in name:
                # Root and processor are configured per non-image modality.
                setattr(self, f"{name}_root", kwargs[f"{name}_root"])
                setattr(self, f"{name}_processor", kwargs[f"{name}_processor"])
            lister = getattr(self, f'get_existing_{name}_annotations')
            setattr(self, f"existing_{name}_annotation", lister())

        # Ids present for every requested modality.
        per_modality = (
            set(getattr(self, f"existing_{name}_annotation"))
            for name in self.modalities
        )
        self.sample_ids = set.intersection(*per_modality)

        # Keep only the first annotation of each video.
        unique = []
        seen = set()
        for rec in self.annotation:
            video = rec["video"]
            if video in seen:
                continue
            seen.add(video)
            unique.append(rec)
        self.annotation = unique

    def __len__(self):
        return len(self.annotation)

    def get_existing_audio_annotations(self):
        """File names in ``audio_root`` with the last dotted suffix removed."""
        return ['.'.join(fname.split('.')[:-1]) for fname in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        """File names in ``video_root`` with the last dotted suffix removed."""
        return ['.'.join(fname.split('.')[:-1]) for fname in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        return os.path.join(self.audio_root, f'{ann["video"]}')

    def get_video_path(self, ann):
        return os.path.join(self.video_root, f'{ann["video"]}')

    def __getitem__(self, index):
        """Return a copy of annotation ``index`` extended with processed modalities."""
        ann = copy.deepcopy(self.annotation[index])

        # Defaults derived from the video name; the path entries are replaced
        # below for every modality that is actually requested.
        video_name = ann["video"]
        ann["video_path"] = video_name
        ann["audio_path"] = video_name
        ann["sample_id"] = video_name
        ann['text_input'] = ann["caption"]
        ann["image_id"] = video_name

        for name in self.modalities:
            path_key = f"{name}_path"
            ann[path_key] = getattr(self, f"get_{name}_path")(ann)
            if type(ann[path_key]) is list:
                # Several candidate files: pick one at random.
                ann[path_key] = random.choice(ann[path_key])
            if 'image' in name:
                ann['image'] = self.vis_processor(Image.open(ann["images_path"]))
            else:
                processor = getattr(self, f"{name}_processor")
                ann[name] = processor(ann[path_key]).to(torch.float32)

        return ann
class VGVQAInstructDataset(VGVQADataset):
    """Instruction-tuning view of VG-VQA that adds a ``text_output`` target."""

    def __getitem__(self, index):
        """Return the parent sample with one of its answers as ``text_output``."""
        sample = super().__getitem__(index)
        if sample is None:
            return sample
        sample['text_output'] = random.choice(sample["answers"])
        return sample

    def collater(self, samples):
        """Collate with the parent, then expose the batch answers as ``text_output``."""
        batch = super().collater(samples)
        batch['text_output'] = batch['answer']
        return batch
class VideoCaptionEvalDataset(BaseDataset):
    """Video captioning evaluation set with one entry per distinct video."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        # Do not repeat videos at inference time. The membership test and the
        # recorded value must use the same field ("video"); testing one field
        # while recording another removes nothing unless both hold the same
        # value.
        seen_videos = set()
        unique = []
        for ann in self.annotation:
            video = ann["video"]
            if video in seen_videos:
                continue
            seen_videos.add(video)
            unique.append(ann)
        self.annotation = unique

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        """Return the processed video for ``index``, or ``None`` if it cannot be loaded."""
        ann = self.annotation[index]

        vname = ann["video"]
        video_path = os.path.join(self.vis_root, vname)

        try:
            video = self.vis_processor(video_path)
        except Exception:
            # Best effort: report and skip unreadable videos, but let
            # KeyboardInterrupt / SystemExit propagate.
            print(f"Could not load {video_path}")
            return None

        return {
            "video": video,
            "image_id": ann["image_id"],
            "instance_id": ann["instance_id"],
        }
class WebVideoCaptionDataset(BaseDataset):
    """Captioning dataset whose videos are stored as ``<videoid>.mp4`` under ``vis_root``."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def _get_video(self, index):
        """
        If video does not exist, loop to the next one.

        Returns a ``(video, video_path, ann)`` tuple for the first annotation
        that loads, starting at ``index`` and trying at most three
        consecutive annotations; returns ``None`` when all attempts fail.
        """
        max_retries = 3
        for _ in range(max_retries):
            ann = self.annotation[index]
            video_path = os.path.join(self.vis_root, f"{ann['videoid']}.mp4")
            try:
                video = self.vis_processor(video_path)
            except Exception:
                # Safely loop back to start of annotations
                index = (index + 1) % len(self.annotation)
            else:
                return video, video_path, ann
        return None

    def __getitem__(self, index):
        """Return the sample dict, or ``None`` when no video could be loaded."""
        loaded = self._get_video(index)
        if loaded is None:
            # Every retry failed. Return None (which the instruct subclass
            # already checks for) instead of failing on tuple-unpacking None.
            return None
        video, video_path, ann = loaded
        caption = self.text_processor(ann["name"])

        # "image_id" is kept for compatibility with the COCO evaluation format
        return {
            "video": video,
            "text_input": caption,
            "image_id": ann["videoid"],
            "instance_id": ann["instance_id"],
        }
class VideoQADataset(MultimodalClassificationDataset, __DisplMixin):
    """Video question answering posed as classification over an answer vocabulary."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def _build_class_labels(self, ans_path):
        """Load the answer -> label mapping from the JSON file at ``ans_path``."""
        # Context manager so the file handle is closed deterministically.
        with open(ans_path) as ans_file:
            ans2label = json.load(ans_file)

        self.class_labels = ans2label

    def _get_answer_label(self, answer):
        """Return the label of ``answer``; unknown answers map to ``len(class_labels)``."""
        if answer in self.class_labels:
            return self.class_labels[answer]
        else:
            return len(self.class_labels)

    def __getitem__(self, index):
        """Return the processed video, question and answer label for ``index``."""
        # getattr keeps the intended message even when the attribute was
        # never set on this instance.
        assert getattr(
            self, "class_labels", None
        ), f"class_labels of {__class__.__name__} is not built yet."

        ann = self.annotation[index]

        vname = ann["video"]
        vpath = os.path.join(self.vis_root, vname)

        frms = self.vis_processor(vpath)
        question = self.text_processor(ann["question"])

        return {
            "video": frms,
            "text_input": question,
            "answers": self._get_answer_label(ann["answer"]),
            "question_id": ann["question_id"],
            "instance_id": ann["instance_id"],
        }
class ViolinVideoEntailmentInstructDataset(ViolinVideoEntailmentDataset):
    """Instruction-tuning view of Violin: a yes/no question about each statement."""

    def __getitem__(self, index):
        """Return the parent sample extended with ``text_input`` / ``text_output``.

        Returns ``None`` when the parent could not load the video.
        """
        data = super().__getitem__(index)
        if data is None:
            return None

        # Prompt wording is kept exactly as it was, including spelling.
        templates = ["is it true that {}?", "is the satement {} contained in the video?", "is the statement {} entailed in the video?"]

        # The parent stores the *integer* class id in data['label'], so a
        # comparison with the string 'correct' alone is never true and every
        # target would be 'no'. Accept the class id, and the raw string for
        # backward compatibility.
        is_correct = data['label'] in ('correct', self.class_labels['correct'])
        data['text_output'] = "yes" if is_correct else 'no'
        data['text_input'] = random.choice(templates).format(data["sentence"])
        return data
class __DisplMixin:
    """Mixin that assembles a display-friendly view of one dataset item."""

    def displ_item(self, index):
        """Return an ``OrderedDict`` with the file, dialog and image of item *index*."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        # The annotations built in VisDialDataset.__init__ store the context
        # under "dialog" (the key __getitem__ reads too), not "dialogue".
        # NOTE(review): "image" is read from the raw annotation as before;
        # this file only ever sets/reads "image_id" — confirm the key exists.
        return OrderedDict(
            {
                "file": ann["image"],
                "dialogue": ann["dialog"],
                "image": sample["image"],
            }
        )


class VisDialDataset(DialogueDataset, __DisplMixin):
    """Visual Dialog training data, expanded to one sample per dialog turn."""

    # Image sub-directory and filename prefix of the split this class reads.
    _split_name = "VisualDialog_train2018"

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths (list): annotation JSON files; each holds "dialogs",
            "answers" and "questions" under a top-level "data" key
        """
        self.vis_root = vis_root
        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self.annotation = []
        for ann_path in ann_paths:
            # Context manager so the file handle is closed after parsing.
            with open(ann_path, "r") as f:
                data = json.load(f)['data']
            answers = data['answers']
            questions = data['questions']

            for dialog in data['dialogs']:
                # Turns reference the shared question/answer tables by index.
                all_turns = [
                    {
                        "answer": answers[d["answer"]],
                        "question": questions[d["question"]],
                    }
                    for d in dialog['dialog']
                ]
                for i, last_turn in enumerate(all_turns):
                    dialogue_context = ' '.join(
                        [f" q: {t['question']} a: {t['answer']}" for t in all_turns[:i]]
                    ).strip()

                    # One independent entry per turn. Appending the mutated
                    # `dialog` itself made every entry of a dialog alias the
                    # same dict, i.e. all of them showed the final turn.
                    entry = dict(dialog)
                    entry["dialog"] = dialogue_context
                    entry["question"] = last_turn["question"]
                    entry["answer"] = last_turn["answer"]
                    self.annotation.append(entry)

        # The original repeated the processor assignment and this call twice
        # in a row; a single call is kept.
        self._add_instance_ids()

        # Map each image_id to a dense integer, in first-seen order.
        self.img_ids = {}
        for ann in self.annotation:
            self.img_ids.setdefault(ann["image_id"], len(self.img_ids))

    def _image_path(self, image_id):
        """Return the path of the image with *image_id* for this class's split."""
        filename = self._split_name + '_' + str(image_id).zfill(12) + '.jpg'
        return os.path.join(self.vis_root, self._split_name, filename)

    def __getitem__(self, index):
        """Return image, processed dialog context and question, dense image id, and answer."""
        ann = self.annotation[index]

        image = Image.open(self._image_path(ann["image_id"])).convert("RGB")
        image = self.vis_processor(image)

        return {
            "image": image,
            "dialog": self.text_processor(ann["dialog"]),
            "text_input": self.text_processor(ann["question"]),
            "image_id": self.img_ids[ann["image_id"]],
            "answer": ann["answer"]
        }


class VisDialInstructDataset(VisDialDataset, __DisplMixin):
    """Instruction-style variant: the answer is also exposed as ``text_output``."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data["text_output"] = data["answer"]
        return data


class VisDialEvalDataset(VisDialDataset, __DisplMixin):
    """Validation split: identical samples, images read from the val2018 folder."""

    _split_name = "VisualDialog_val2018"
class VlepVideoDataset(BaseDataset):
    """VLEP clips paired with the annotated answer event as caption."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of the .mp4 clips
        ann_paths (list): annotation files, loaded by the base class

        Annotations whose ``vid_name`` has no file in ``vis_root`` are dropped.
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        # A set gives O(1) membership tests; the list version rescanned the
        # whole directory listing for every annotation.
        existing_videos = {f.replace('.mp4', '') for f in os.listdir(self.vis_root)}
        self.annotation = [ann for ann in self.annotation if ann['vid_name'] in existing_videos]

    def __getitem__(self, index):
        """Return the sample dict for *index*, or ``None`` if the clip cannot be processed."""
        ann = self.annotation[index]

        vname = ann['vid_name'] + '.mp4'
        video_path = os.path.join(self.vis_root, vname)

        try:
            # Widen the timestamp window to whole seconds.
            video = self.vis_processor(
                video_path,
                start_sec=math.floor(ann['ts'][0]),
                end_sec=math.ceil(ann['ts'][1]),
            )
        except Exception:
            # Best effort: skip unreadable clips, but let
            # KeyboardInterrupt/SystemExit through (a bare except did not).
            return None

        # ann['answer'] indexes into the candidate events.
        caption = self.text_processor(ann['events'][ann['answer']])

        # "image_id" is kept to stay compatible with the COCO evaluation format
        return {
            "video": video,
            # NOTE(review): the caption goes through text_processor twice
            # (kept as in the original) — confirm this is intended.
            "text_input": self.text_processor(caption),
            "image_id": vname,
            "example_id": ann['example_id'],
            "instance_id": ann["instance_id"]
        }


class VlepVideoInstructDataset(VlepVideoDataset):
    """Instruction-style variant: a random "what happens next" prompt, caption as target."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is None:
            return None

        templates = [
            "what is likely to happen next?",
            "what comes after this?",
            "where is this leading?",
            "in your estimation, what's the next move?",
            "can you foresee the subsequent events?",
            "based on the video, what might follow?",
            "can you give a glimpse into what might be coming?",
        ]
        data['text_output'] = data["text_input"]
        data['text_input'] = self.text_processor(random.choice(templates))
        return data


class VlepVideoEvalDataset(VlepVideoDataset):
    """Evaluation variant: same samples with ``text_input`` removed."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class VQADataset(BaseDataset):
    """Base class for VQA training datasets; provides the batch collater."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def collater(self, samples):
        """Merge per-item samples into one batch.

        ``None`` samples are dropped first; if nothing remains, ``None`` is
        returned. Every remaining sample must provide "image", "text_input",
        "weights" and "answers". Answers and weights are flattened across the
        batch; "n_answers" records how many answers each sample contributed.
        """
        # Filter out None samples
        samples = [s for s in samples if s is not None]
        if not samples:
            return None

        answer_list, weight_list = [], []
        for sample in samples:
            weight_list.extend(sample["weights"])
            answer_list.extend(sample["answers"])

        return {
            "image": torch.stack([s["image"] for s in samples], dim=0),
            "text_input": [s["text_input"] for s in samples],
            "answer": answer_list,
            "weight": weight_list,
            "n_answers": torch.LongTensor([len(s["answers"]) for s in samples]),
        }


class VQAInstructDataset(VQADataset):
    """Instruction-style variant: the flattened answers double as ``text_output``."""

    def collater(self, samples):
        """Collate like the parent and add "text_output"; ``None`` for an empty batch."""
        data = super().collater(samples)
        # The parent returns None when every sample was filtered out;
        # subscripting that raised TypeError.
        if data is None:
            return None
        data['text_output'] = data['answer']
        return data


class VQAEvalDataset(BaseDataset):
    """Base class for VQA evaluation datasets (no custom collater)."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
class VSRClassificationDataset(MultimodalClassificationDataset):
    """VSR image/caption pairs with a binary label taken from the annotation."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths (list): annotation files, loaded by the base class
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self.class_labels = self._build_class_labels()
        self.classnames = ['no', 'yes']

    def _build_class_labels(self):
        """Return the mapping from class name to class id."""
        return {"no": 0, "yes": 1}

    def __getitem__(self, index):
        """Return the processed image, its id (file stem), raw caption, label and instance id."""
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = self.vis_processor(Image.open(image_path).convert("RGB"))

        return {
            "image": image,
            # Everything before the first "." of the file name.
            "image_id": ann["image"].split('.')[0],
            "text_input": ann['caption'],
            "label": ann["label"],
            "instance_id": ann["instance_id"],
        }


class VSRClassificationInstructDataset(VSRClassificationDataset):
    """Instruction-style variant: adds textual yes/no targets derived from the label."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            positive = data["label"] == 1
            data["answer"] = ["yes", "true"] if positive else ["no", "false"]
            data["text_output"] = "yes" if positive else "no"
        return data


class VSRCaptionDataset(BaseDataset):
    """Captioning view of VSR: keeps only annotations whose label is 1."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths (list): annotation files, loaded by the base class
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self.annotation = [ann for ann in self.annotation if ann['label'] == 1]

    def __getitem__(self, index):
        """Return the processed image, its id (file stem) and the raw caption."""
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = self.vis_processor(Image.open(image_path).convert("RGB"))

        return {
            "image": image,
            "image_id": ann["image"].split('.')[0],
            "text_input": ann['caption'],
        }


class VSRCaptionInstructDataset(VSRCaptionDataset):
    """Instruction-style captioning: the caption is the target, the prompt is empty."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class VSRCaptionEvalDataset(VSRCaptionDataset):
    """Evaluation variant: same samples with ``text_input`` removed."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class YT8MDialDataset(BaseDataset):
    """YT8M dialogue samples restricted to clips available in every requested modality."""

    def __init__(self, **kwargs):
        """Build the dataset from keyword arguments.

        Required keys: ``vis_processor``, ``text_processor``, ``vis_root``,
        ``ann_paths``, ``modalities``. For every modality whose name does not
        contain "image", ``<modality>_root`` and ``<modality>_processor`` are
        required as well.
        """
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            # Image-like modalities reuse vis_root / vis_processor and get no
            # dedicated root or processor attributes.
            if 'image' not in modality:
                setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
                setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            # NOTE(review): only the audio and video getters are defined in
            # this class; an image-like modality needs a matching
            # get_existing_<modality>_annotations from elsewhere.
            setattr(
                self,
                f"existing_{modality}_annotation",
                getattr(self, f'get_existing_{modality}_annotations')(),
            )

        # Keep only annotations whose clip exists for all modalities.
        self.sample_ids = set.intersection(
            *[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]
        )
        self.annotation = [ann for ann in self.annotation if ann['youtube_id'] in self.sample_ids]

    def get_existing_audio_annotations(self):
        """Return the id prefix (text before the first "_") of every file in ``audio_root``."""
        return [f.split('_')[0] for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        """Return the id prefix (text before the first "_") of every file in ``video_root``."""
        return [f.split('_')[0] for f in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        """Return ``<audio_root>/<youtube_id>_<start_sec>_<end_sec>.flac``."""
        return os.path.join(self.audio_root, f'{ann["youtube_id"]}_{ann["start_sec"]}_{ann["end_sec"]}.flac')

    def get_video_path(self, ann):
        """Return ``<video_root>/<youtube_id>_<start_sec>_<end_sec>.mp4``."""
        return os.path.join(self.video_root, f'{ann["youtube_id"]}_{ann["start_sec"]}_{ann["end_sec"]}.mp4')

    def __getitem__(self, index):
        """Return a copy of annotation *index* enriched with loaded modalities and text fields.

        Returns ``None`` when the video cannot be processed.
        """
        # Deep copy so the stored annotation is never mutated.
        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            path_key = f"{modality}_path"
            ann[path_key] = getattr(self, f"get_{modality}_path")(ann)
            # A path getter may return several candidates; pick one at random.
            if isinstance(ann[path_key], list):
                ann[path_key] = random.choice(ann[path_key])

            if 'video' in modality:
                try:
                    ann['video'] = self.video_processor(
                        ann["video_path"], start_sec=ann['start_sec'], end_sec=ann['end_sec']
                    ).to(torch.float32)
                except Exception:
                    # Best effort: skip unreadable clips, but let
                    # KeyboardInterrupt/SystemExit through.
                    return None
            elif 'image' in modality:
                # NOTE(review): reads the literal key "images_path", which is
                # only populated when the modality is named "images".
                ann['image'] = self.vis_processor(Image.open(ann["images_path"]))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(ann[path_key]).to(torch.float32)

        ann["sample_id"] = ann["youtube_id"]
        ann['text_output'] = self.text_processor(ann['response'])
        ann['text_input'] = self.text_processor(ann['context'])
        ann["question_id"] = index
        ann['captions'] = ann['response']
        return ann
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md b/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0dd0b9d5bfe304770d06b2adc363f33a6c390ced --- /dev/null +++ b/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md @@ -0,0 +1,22 @@ + + +# Download Conceptual Captions Data + +Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder + +`Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333) + +Run `download_data_cc3m.py` or `download_data_cc12m.py`. + +Images will be in default LAVIS cache folders. You can stop and resume the download. The settings for splitting downloads into chunks / threads are not optimal, but they maxed out my connection, so I kept them as is. + +Note: A previous version of this script used a different file naming scheme. This has changed, so if you are resuming a previously started download, you will get duplicates. + +A number of images will fail to download and return web pages instead. These will need to be cleaned up later. See `downloaded_validation_report.tsv` after it downloads for HTTP errors. Around 8% of images are gone, based on validation set results. Setting the user agent might also fix some errors, though it is unclear whether any sites reject requests based on it. + +It should take about a day or two to download the training data; keep an eye on disk space. 
diff --git a/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb b/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7ca886ffb664a0c02b45b5b0fabc5159284ef88e --- /dev/null +++ b/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from lavis.common.utils import get_abs_path, get_cache_path" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cc12m = pd.read_csv(\"downloaded_cc12m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "caption a very typical bus station\n", + "path /export/home/.cache/lavis/conceptual_caption/i...\n", + "dataset cc3m\n", + "mimetype image/jpeg\n", + "size 36078\n", + "status 200\n", + "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc12m.iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3318333" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(cc12m)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3130587/3130587 [17:28<00:00, 
2986.08it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 2759017 valid records\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "cnt = 0\n", + "\n", + "valid_records = []\n", + "\n", + "for i, path in tqdm(enumerate(cc12m.path.unique()), total=len(cc12m.path.unique())):\n", + " path = str(path)\n", + " if os.path.exists(path):\n", + " record = cc12m.iloc[i]\n", + " valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n", + "\n", + " cnt += 1\n", + "\n", + "print(\"Found {} valid records\".format(cnt))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2759017" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(valid_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n", + " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valid_records[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." 
+ ] + } + ], + "source": [ + "from omegaconf import OmegaConf\n", + "\n", + "\n", + "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_12m.yaml\")\n", + "\n", + "ann_path = OmegaConf.load(\n", + " config_path\n", + ").datasets.conceptual_caption_12m.build_info.annotations.train.storage[0]\n", + "\n", + "ann_path = get_cache_path(ann_path)\n", + "\n", + "if os.path.exists(ann_path):\n", + " # abort\n", + " print(\"{} already exists\".format(ann_path))\n", + "else:\n", + " # Save the valid records to a json file\n", + " with open(ann_path, \"w\") as f:\n", + " f.write(json.dumps(valid_records))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb b/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ce08209d0f16120d2b1c11be095de6482d9fb71f --- /dev/null +++ b/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from lavis.common.utils import get_abs_path, get_cache_path" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + 
"outputs": [], + "source": [ + "cc3m = pd.read_csv(\"downloaded_cc3m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "caption a very typical bus station\n", + "path /export/home/.cache/lavis/conceptual_caption/i...\n", + "dataset cc3m\n", + "mimetype image/jpeg\n", + "size 36078\n", + "status 200\n", + "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc3m.iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3318333" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(cc3m)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 2759017 valid records\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "cnt = 0\n", + "\n", + "valid_records = []\n", + "\n", + "for i, path in tqdm(enumerate(cc3m.path.unique()), total=len(cc3m.path.unique())):\n", + " path = str(path)\n", + " if os.path.exists(path):\n", + " record = cc3m.iloc[i]\n", + " valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n", + "\n", + " cnt += 1\n", + "\n", + "print(\"Found {} valid records\".format(cnt))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2759017" + ] + }, + 
"execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(valid_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n", + " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valid_records[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." 
+ ] + } + ], + "source": [ + "from omegaconf import OmegaConf\n", + "\n", + "\n", + "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_3m.yaml\")\n", + "\n", + "ann_path = OmegaConf.load(\n", + " config_path\n", + ").datasets.conceptual_caption_3m.build_info.annotations.train.storage[0]\n", + "\n", + "ann_path = get_cache_path(ann_path)\n", + "\n", + "if os.path.exists(ann_path):\n", + " # abort\n", + " print(\"{} already exists\".format(ann_path))\n", + "else:\n", + " # Save the valid records to a json file\n", + " with open(ann_path, \"w\") as f:\n", + " f.write(json.dumps(valid_records))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py b/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py new file mode 100644 index 0000000000000000000000000000000000000000..c60b6fb8e5ae81783f9fafa71648c147871798ec --- /dev/null +++ b/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py @@ -0,0 +1,232 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import time +from PIL import Image +from lavis.common.utils import get_abs_path, get_cache_path +from multiprocessing import Pool +from omegaconf import OmegaConf +from pathlib import Path +from torchvision.transforms import functional as TF +from tqdm import tqdm +import glob +import io +import json +import magic # pip install python-magic +import numpy as np +import os +import pandas as pd +import requests +import shelve +import zlib + +headers = { + #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot + "X-Forwarded-For": "64.18.15.200", +} + + +def _df_split_apply(tup_arg): + split_ind, subset, func = tup_arg + r = subset.apply(func, axis=1) + return (split_ind, r) + + +def df_multiprocess(df, processes, chunk_size, func, dataset_name): + print("Generating parts...") + with shelve.open( + "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size) + ) as results: + + pbar = tqdm(total=len(df), position=0) + # Resume: + finished_chunks = set([int(k) for k in results.keys()]) + pbar.desc = "Resuming" + for k in results.keys(): + pbar.update(len(results[str(k)][1])) + + pool_data = ( + (index, df[i : i + chunk_size], func) + for index, i in enumerate(range(0, len(df), chunk_size)) + if index not in finished_chunks + ) + print( + int(len(df) / chunk_size), + "parts.", + chunk_size, + "per part.", + "Using", + processes, + "processes", + ) + + pbar.desc = "Downloading" + with Pool(processes) as pool: + for i, result in enumerate( + pool.imap_unordered(_df_split_apply, pool_data, 2) + ): + results[str(result[0])] = result + pbar.update(len(result[1])) + pbar.close() + + print("Finished Downloading.") + return + + +# Unique name based on url +def _file_name(row): + name = 
# Don't download image, just check with a HEAD request, can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    """
    Probe ``row["url"]`` with an HTTP HEAD request and record the outcome on the row.

    Sets ``row["status"]`` to the response status code and ``row["headers"]`` to
    the response headers. If the request raises, ``row["status"]`` is set to 408
    and the row is returned without headers. ``row["file"]`` is set to the
    target file name only when the response is OK; nothing is downloaded.

    Args:
        row: DataFrame row with at least a ``"url"`` entry.

    Returns:
        The same row object, updated in place.
    """
    fname = _file_name(row)
    try:
        # not all sites will support HEAD
        response = requests.head(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
        row["headers"] = dict(response.headers)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit inside pool workers. Per-row failures are still recorded
        # as 408 (timeout) so they can be logged/retried later.
        row["status"] = 408
        return row
    if response.ok:
        row["file"] = fname
    return row
def df_from_shelve(chunk_size, func, dataset_name):
    """
    Rebuild one DataFrame from the per-chunk results stored in the shelf.

    The shelf file name is derived from ``dataset_name``, ``func.__name__`` and
    ``chunk_size``. Entries are read in ascending numeric key order and the
    second element of each stored tuple is concatenated.

    Returns:
        The concatenated DataFrame (columns sorted by ``pd.concat(sort=True)``).
    """
    print("Generating Dataframe from results...")
    shelf_path = "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
    with shelve.open(shelf_path) as shelf:
        # Keys are chunk indices stored as strings; order them numerically.
        ordered_indices = sorted(map(int, shelf.keys()))
        frames = []
        for chunk_index in ordered_indices:
            stored = shelf[str(chunk_index)]
            frames.append(stored[1])
        combined = pd.concat(frames, sort=True)
    return combined
+images_per_part = 100 + +data_name = "cc12m" +# os.makedirs(data_name, exist_ok=True) + +df = open_tsv("cc12m.tsv", data_name) +df_multiprocess( + df=df, + processes=num_processes, + chunk_size=images_per_part, + func=download_image, + dataset_name=data_name, +) +df = df_from_shelve( + chunk_size=images_per_part, func=download_image, dataset_name=data_name +) +df.to_csv( + "downloaded_%s_report.tsv.gz" % data_name, + compression="gzip", + sep="\t", + header=False, + index=False, +) +print("Saved.") diff --git a/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py b/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py new file mode 100644 index 0000000000000000000000000000000000000000..2edd7a224436f7fa2d923501caadd40db040f8a1 --- /dev/null +++ b/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py @@ -0,0 +1,229 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import glob +from pathlib import Path +import time +from omegaconf import OmegaConf +import pandas as pd +import numpy as np +import requests +import zlib +import os +import io +import shelve +from lavis.common.utils import get_abs_path, get_cache_path +import magic # pip install python-magic +import json +from multiprocessing import Pool +from tqdm import tqdm +from PIL import Image +from torchvision.transforms import functional as TF + +headers = { + #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot + "X-Forwarded-For": "64.18.15.200", +} + + +def _df_split_apply(tup_arg): + split_ind, subset, func = tup_arg + r = subset.apply(func, axis=1) + return (split_ind, r) + + +def df_multiprocess(df, processes, 
# For checking mimetypes separately without download
def check_mimetype(row):
    """
    Record mimetype and size for a row whose ``"file"`` entry exists on disk.

    Rows whose ``"file"`` value is not an existing file (including non-string
    values, which are stringified for the check) are returned unchanged.

    Returns:
        The same row object, with ``"mimetype"`` and ``"size"`` filled in when
        the file exists.
    """
    path = row["file"]
    if not os.path.isfile(str(path)):
        return row

    row["mimetype"] = magic.from_file(path, mime=True)
    row["size"] = os.stat(path).st_size
    return row
def download_image(row):
    """
    Download, resize and store the image referenced by ``row["url"]``.

    The target path comes from ``_file_name(row)``. If that file already exists
    the download is skipped and the row is reported with status 200. Otherwise
    the URL is fetched; the HTTP status code is recorded in ``row["status"]``,
    or 408 if the request, the decode or the save raises.

    On success ``row["file"]``, ``row["mimetype"]`` and ``row["size"]`` are set.
    If decoding or saving fails, any file left at the target path is removed
    and ``row["file"]`` is not set, so a later run retries the URL instead of
    mistaking the leftover for a finished download.

    Args:
        row: DataFrame row with at least a ``"url"`` entry.

    Returns:
        The same row object, updated in place.
    """
    fname = _file_name(row)
    # Skip Already downloaded, retry others later
    if os.path.isfile(fname):
        row["status"] = 200
        row["file"] = fname
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
        return row

    try:
        # use smaller timeout to skip errors, but can result in failed downloads
        response = requests.get(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
    except Exception:
        # log errors later, set error as 408 timeout
        row["status"] = 408
        return row

    if response.ok:
        try:
            # some sites respond with gzip transport encoding
            response.raw.decode_content = True
            img = resize_img(io.BytesIO(response.content))
            img.save(fname)

            row["mimetype"] = magic.from_file(fname, mime=True)
            row["size"] = os.stat(fname).st_size
        except Exception:
            # This is if it times out during a download or decode
            row["status"] = 408
            # A half-written file would satisfy the isfile() check above on the
            # next run and be reported as a successful download.
            try:
                if os.path.isfile(fname):
                    os.remove(fname)
            except OSError:
                pass
            return row

        row["file"] = fname
    return row
# Module-level configuration. These names are read as globals by the row
# functions above (resize_img uses resize_size, _file_name uses storage_dir),
# so they must stay at import level for worker processes to see them.
resize_size = 384

config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_3m.yaml")

storage_dir = OmegaConf.load(
    config_path
).datasets.conceptual_caption_3m.build_info.images.storage
storage_dir = Path(get_cache_path(storage_dir))

os.makedirs(storage_dir, exist_ok=True)

# number of processes in the pool can be larger than cores
num_processes = 32
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 100

data_name = "cc3m"

# The driver starts a multiprocessing.Pool. Under the "spawn" start method each
# worker re-imports this module, so the driver must be guarded or every worker
# would try to start the whole download again.
if __name__ == "__main__":
    df = open_tsv("Train_GCC-training.tsv", data_name)
    df_multiprocess(
        df=df,
        processes=num_processes,
        chunk_size=images_per_part,
        func=download_image,
        dataset_name=data_name,
    )
    df = df_from_shelve(
        chunk_size=images_per_part, func=download_image, dataset_name=data_name
    )
    # Write the per-URL report (status, file, mimetype, size, ...) next to the script.
    df.to_csv(
        "downloaded_%s_report.tsv.gz" % data_name,
        compression="gzip",
        sep="\t",
        header=False,
        index=False,
    )
    print("Saved.")
train_file = './train.jsonl'
test_file = './test.jsonl'


def _load_jsonl(path):
    """Return the JSON object found on each non-blank line of ``path``."""
    with open(path) as f:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing in json.loads.
        return [json.loads(line) for line in f if line.strip()]


def _add_lavis_fields(records):
    """Add the ``video_path`` and ``ts`` fields to every record, in place."""
    for d in tqdm(records):
        d['video_path'] = d['video_id'] + '.mp4'
        d['ts'] = [float(d['start']), float(d['end'])]
    return records


def _dump_json(obj, path):
    """Write ``obj`` as JSON to ``path`` and close the file so the data is flushed."""
    with open(path, 'w') as f:
        json.dump(obj, f)


# Load both splits first so a missing input fails before anything is written.
train_data = _load_jsonl(train_file)
test_data = _load_jsonl(test_file)

_add_lavis_fields(train_data)
_add_lavis_fields(test_data)

_dump_json(train_data, 'train_lavis.json')
_dump_json(test_data, 'test_lavis.json')
if __name__ == "__main__":

    config_path = get_abs_path("configs/datasets/coco/defaults_cap.yaml")

    storage_dir = OmegaConf.load(
        config_path
    ).datasets.coco_caption.build_info.images.storage

    # Archives are downloaded next to the storage directory and extracted into it.
    download_dir = Path(get_cache_path(storage_dir)).parent / "download"
    storage_dir = Path(get_cache_path(storage_dir))

    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        exit(0)

    try:
        for k, v in DATA_URL.items():
            print("Downloading {} to {}".format(v, k))
            download_datasets(download_dir, v)
    except Exception as e:
        # remove download dir if failed
        cleanup_dir(download_dir)
        # Also drop the partially extracted storage dir: the existence check
        # above would otherwise refuse to run again.
        # NOTE(review): assumes cleanup_dir tolerates a missing directory - confirm.
        cleanup_dir(storage_dir)
        print("Failed to download or extracting datasets. Aborting.")
        print(e)
        # Actually abort, with a non-zero exit code so callers can detect the failure.
        exit(1)

    cleanup_dir(download_dir)
import subprocess

output_path = './videos'
json_path = './COIN.json'

os.makedirs(output_path, exist_ok=True)

# Load the annotation database once; it drives both the downloads and the
# annotation conversion below.
with open(json_path, 'r') as f:
    data = json.load(f)['database']

for youtube_id, info in data.items():
    recipe_type = info['recipe_type']
    url = info['video_url']
    vid_loc = output_path + '/' + str(recipe_type)
    os.makedirs(vid_loc, exist_ok=True)
    try:
        # Argument list, no shell: values taken from the JSON file cannot be
        # interpreted as shell syntax, and paths containing spaces work.
        # The exit status is ignored, as before, so one failed video does not
        # stop the remaining downloads.
        subprocess.run(
            ['youtube-dl', '-o', vid_loc + '/' + youtube_id + '.mp4', '-f', 'best', url],
            check=False,
        )
    except FileNotFoundError:
        # youtube-dl is not installed: no download can succeed, so stop trying
        # but still convert the annotations below.
        print("youtube-dl not found; skipping video downloads. Run 'pip install youtube-dl'.")
        break

    # To save disk space, you could download the best format available
    # but not better than 480p, or optionally any other quality.
    # See https://askubuntu.com/questions/486297/how-to-select-video-quality-from-youtube-dl

## convert annotations
all_json = data
train_data = []
test_data = []
for k, v in all_json.items():
    for gt_ann in v['annotation']:
        # The id used in the annotations is the last path component of the video URL.
        youtube_id = v["video_url"].split("/")[-1]
        new_ann = {
            'youtube_id': youtube_id,
            'recipe_type': v["recipe_type"],
            'video_path': f'{v["recipe_type"]}/{youtube_id}.mp4',
            'caption': gt_ann['label'],
            'id': gt_ann['id'],
            'ts': gt_ann['ts'],
        }
        # Everything that is not in the training subset goes to the test split.
        if v['subset'] == 'training':
            train_data.append(new_ann)
        else:
            test_data.append(new_ann)

with open('train.json', 'w') as f:
    json.dump(train_data, f)
with open('test.json', 'w') as f:
    json.dump(test_data, f)
def move_files(download_path, storage_path):
    """
    Relocate every entry of ``download_path`` into ``storage_path``.

    ``storage_path`` is created if needed (including parents). Entries keep
    their names and are moved with ``os.rename``.
    """
    print("Moving to {}".format(storage_path))
    os.makedirs(storage_path, exist_ok=True)

    entries = os.listdir(download_path)
    for entry in entries:
        source = os.path.join(download_path, entry)
        target = os.path.join(storage_path, entry)
        os.rename(source, target)
def move_directory(src_dir, dst_dir):
    """
    Move every entry found in ``src_dir`` into ``dst_dir``.

    ``dst_dir`` is created if it does not exist yet. Entries keep their names
    and are moved one by one with ``os.rename``.
    """
    print("Moving to {}".format(dst_dir))
    os.makedirs(dst_dir, exist_ok=True)

    # Snapshot the listing first, then rename each (old, new) pair.
    moves = [
        (os.path.join(src_dir, name), os.path.join(dst_dir, name))
        for name in os.listdir(src_dir)
    ]
    for old_path, new_path in moves:
        os.rename(old_path, new_path)
def download_datasets(root, url):
    """
    Fetch the archive at ``url`` via ``download_and_extract_archive``.

    ``root`` is passed as the download directory and the parent of the
    module-level ``storage_dir`` as the extraction directory.

    NOTE(review): ``storage_dir`` is not a parameter; it is the global assigned
    in the ``if __name__ == "__main__":`` block, so this function only works
    when the file is run as a script.
    """
    download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir.parent)
image_dir = './all_images'
os.makedirs(image_dir, exist_ok=True)
for split in ['train', 'test', 'val']:
    print(f"Processing split {split}...")
    path = f'{os.path.abspath(image_dir)}/{split}/choose_txt'
    annotations = []
    for instance_id in tqdm(os.listdir(path)):
        instance_dir = os.path.join(path, instance_id)
        if not os.path.isdir(instance_dir):
            continue
        with open(os.path.join(instance_dir, 'data.json'), "r") as ann_file:
            ann = json.load(ann_file)
        ann['instance_id'] = instance_id
        ann['image_id'] = f'{split}_{instance_id}'
        ann['image'] = f'{split}_{instance_id}.png'

        # Expose the image under a flat, split-prefixed name in image_dir.
        # os.symlink replaces the former `ln -s` shell command, which broke on
        # paths containing spaces or shell metacharacters.
        link_path = os.path.join(image_dir, ann['image'])
        try:
            os.symlink(os.path.join(instance_dir, 'image.png'), link_path)
        except FileExistsError:
            # Link left over from a previous run; `ln -s` did not abort here either.
            pass

        annotations.append(ann)
    with open(f'annotations_{split}.json', 'w') as out_file:
        out_file.write(json.dumps(annotations))
def download_datasets(root, url):
    """
    Download one MSR-VTT video archive via ``download_and_extract_archive``.

    ``root`` is passed as the download directory. No separate extraction
    directory is given; ``merge_datasets`` later looks for the ``TrainValVideo``
    and ``TestVideo`` folders under that same directory.
    NOTE(review): presumably the helper extracts into ``root`` by default -
    confirm against ``lavis.common.utils``.
    """
    download_and_extract_archive(url=url, download_root=root)
if __name__ == "__main__":

    config_path = get_abs_path("configs/datasets/msrvtt/defaults_cap.yaml")

    storage_dir = OmegaConf.load(
        config_path
    ).datasets.msrvtt_cap.build_info.videos.storage

    download_dir = Path(get_cache_path(storage_dir)).parent / "download"
    storage_dir = Path(get_cache_path(storage_dir))

    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        exit(0)

    try:
        for k, v in DATA_URL.items():
            print("Downloading {} to {}".format(v, k))
            download_datasets(download_dir, v)
    except Exception as e:
        # remove download dir if failed
        cleanup_dir(download_dir)
        print("Failed to download or extracting datasets. Aborting.")
        print(e)
        # Stop here: merging cannot succeed without the extracted archives, and
        # a non-zero exit code lets callers detect the failure.
        exit(1)

    try:
        merge_datasets(download_dir, storage_dir)
    except Exception as e:
        # remove storage dir if failed
        cleanup_dir(download_dir)
        cleanup_dir(storage_dir)
        print("Failed to merging datasets. Aborting.")
        print(e)
        exit(1)

    cleanup_dir(download_dir)
def move_files(download_path, storage_path):
    """
    Move the contents of ``download_path`` into ``storage_path``.

    The destination directory is created when missing. Every directory entry
    is renamed into the destination under its original name.
    """
    print("Moving to {}".format(storage_path))
    os.makedirs(storage_path, exist_ok=True)

    def _relocate(name):
        # Same entry name, new parent directory.
        os.rename(
            os.path.join(download_path, name),
            os.path.join(storage_path, name),
        )

    for item in os.listdir(download_path):
        _relocate(item)
def download_file(url, filename, max_retries=20):
    """
    Download ``url`` to ``filename``, retrying with back-off on failure.

    The first attempt uses ``headers[0]``; after each failure a header set is
    picked at random from ``headers`` and the wait grows by two seconds per
    attempt. HTTP error responses count as failures, so an error page is never
    written to ``filename``. A 404/410 response ends the retries immediately.

    Args:
        url: address to fetch.
        filename: path the response body is written to.
        max_retries: maximum number of attempts (default 20, as before).

    Returns:
        True if the file was written, False if every attempt failed.
    """
    header = headers[0]

    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(url, headers=header, timeout=10)
            # Treat 4xx/5xx as failures instead of saving the error body to disk.
            r.raise_for_status()
            with open(filename, "wb") as f:
                f.write(r.content)
            return True
        except Exception as e:
            logging.info(" ".join(repr(e).splitlines()))
            logging.error(url)

            # The resource is gone; retrying would only sleep for minutes.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status in (404, 410):
                break

            # No point in sleeping after the final attempt.
            if attempt == max_retries:
                break

            # random sample a header from headers
            header = headers[np.random.randint(0, len(headers))]

            time.sleep(3 + attempt * 2)

    return False
def download_image_from_url_test(url):
    """
    Download one test-split image into ``<storage_dir>/test/``.

    The local file keeps the last path component of ``url`` as its name.

    NOTE(review): ``storage_dir`` is the module global assigned inside the
    ``if __name__ == "__main__":`` block; pool workers presumably inherit it
    from the parent process - confirm for non-fork start methods.
    """
    basename = os.path.basename(url)
    filename = os.path.join(storage_dir, "test", basename)

    download_file(url, filename)
def fetch_single_image(image_url, timeout=None, retries=0):
    """
    Fetch an image over HTTP and open it with PIL, retrying on failure.

    Args:
        image_url (str): url of the image to fetch.
        timeout (float): timeout in seconds handed to urlopen. Default: None.
        retries (int): number of additional attempts after the first one. Default: 0.

    Returns:
        image (PIL.Image.Image): the opened image, or None when every attempt
            failed or no attempt was made (retries < 0).
    """
    # A bare `import urllib` does not load the `urllib.request` submodule, so
    # import it explicitly instead of relying on another module having done so.
    import urllib.request

    # Defined up front so the function returns None (instead of raising
    # UnboundLocalError) when the loop body never runs.
    image = None

    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                image = Image.open(io.BytesIO(response.read()))
            break
        except Exception:
            # Deliberate best effort: any network or decoding problem counts
            # as a failed attempt and the next retry (if any) is started.
            image = None

    return image
def download_datasets(root, url):
    """
    Download the archive at `url` into `root` and extract it into the storage dir.

    Args:
        root: directory that receives the downloaded archive.
        url (str): location of the archive to download.
    """
    # storage_dir is the module-level path assigned in the __main__ section.
    archive_options = {
        "url": url,
        "download_root": root,
        "extract_root": storage_dir,
    }
    download_and_extract_archive(**archive_options)
"""
 Copyright (c) 2023, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import json
import os

json_path = './violin_annotation.json'

## convert annotations
# Split the VIOLIN annotation file into train.json and test.json, written to
# the current working directory. Every file is opened in a `with` block so it
# is closed (and the written JSON flushed) deterministically rather than
# whenever the file object happens to be garbage collected.
with open(json_path) as annotation_file:
    all_json = json.load(annotation_file)

# Entries without a 'split' field end up in neither output file.
train_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'train']
test_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'test']

with open('train.json', 'w') as train_file:
    json.dump(train_data, train_file)

with open('test.json', 'w') as test_file:
    json.dump(test_data, test_file)
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import logging +import torch +from omegaconf import OmegaConf +from lavis.common.registry import registry + +from lavis.models.base_model import BaseModel + +from lavis.models.albef_models.albef_classification import AlbefClassification +from lavis.models.albef_models.albef_feature_extractor import AlbefFeatureExtractor +from lavis.models.albef_models.albef_nlvr import AlbefNLVR +from lavis.models.albef_models.albef_pretrain import AlbefPretrain +from lavis.models.albef_models.albef_retrieval import AlbefRetrieval +from lavis.models.albef_models.albef_vqa import AlbefVQA +from lavis.models.alpro_models.alpro_qa import AlproQA +from lavis.models.alpro_models.alpro_retrieval import AlproRetrieval + +from lavis.models.blip_models.blip import BlipBase +from lavis.models.blip_models.blip_caption import BlipCaption +from lavis.models.blip_models.blip_classification import BlipClassification +from lavis.models.blip_models.blip_feature_extractor import BlipFeatureExtractor +from lavis.models.blip_models.blip_image_text_matching import BlipITM +from lavis.models.blip_models.blip_nlvr import BlipNLVR +from lavis.models.blip_models.blip_pretrain import BlipPretrain +from lavis.models.blip_models.blip_retrieval import BlipRetrieval +from lavis.models.blip_models.blip_vqa import BlipVQA + +from lavis.models.blip2_models.blip2 import Blip2Base +from lavis.models.blip2_models.blip2_opt import Blip2OPT +from lavis.models.blip2_models.blip2_t5 import Blip2T5 +from lavis.models.blip2_models.blip2_qformer import Blip2Qformer +from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM + +from lavis.models.blip2_models.blip2_t5_instruct import Blip2T5Instruct +from lavis.models.blip2_models.blip2_vicuna_instruct import Blip2VicunaInstruct +from lavis.models.blip2_models.blip2_vicuna_xinstruct import 
def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
    """
    Load supported models.

    To list all available models and types in registry:
    >>> from lavis.models import model_zoo
    >>> print(model_zoo)

    Args:
        name (str): name of the model.
        model_type (str): type of the model.
        is_eval (bool): whether the model is in eval mode. Default: False.
        device (str or torch.device): device to use. Default: "cpu".
        checkpoint (str): path or url to a checkpoint. Default: None.
            Note that the checkpoint is expected to have the same keys in state_dict as the model.

    Returns:
        model (torch.nn.Module): model.
    """

    model = registry.get_model_class(name).from_pretrained(model_type=model_type)

    if checkpoint is not None:
        model.load_checkpoint(checkpoint)

    if is_eval:
        model.eval()

    # Recognise the CPU both as the string "cpu" and as torch.device("cpu"),
    # the same check load_model_and_preprocess performs, so the model is cast
    # to float32 however the caller spelled the device.
    if device == "cpu" or device == torch.device("cpu"):
        model = model.float()

    return model.to(device)
class ModelZoo:
    """
    Printable overview of the registered model architectures and their types.

    >>> from lavis.models import model_zoo
    >>> # list all available models
    >>> print(model_zoo)
    >>> # show total number of models
    >>> print(len(model_zoo))
    """

    def __init__(self) -> None:
        # architecture name -> keys of that class's PRETRAINED_MODEL_CONFIG_DICT
        self.model_zoo = {}
        for arch_name, model_cls in registry.mapping["model_name_mapping"].items():
            self.model_zoo[arch_name] = list(
                model_cls.PRETRAINED_MODEL_CONFIG_DICT.keys()
            )

    def __str__(self) -> str:
        # Two-column table: a ruled header followed by one row per architecture.
        rule = "=" * 50
        title = f"{'Architectures':<30} {'Types'}\n"
        rows = [
            f"{name:<30} {', '.join(types)}"
            for name, types in self.model_zoo.items()
        ]
        return rule + "\n" + title + rule + "\n" + "\n".join(rows)

    def __iter__(self):
        # Yields (architecture name, list of types) pairs.
        return iter(self.model_zoo.items())

    def __len__(self):
        # Total number of (architecture, type) combinations.
        total = 0
        for types in self.model_zoo.values():
            total += len(types)
        return total
class AlbefBase(BaseModel):
    """Common base of the ALBEF models: tokenizer creation and checkpoint loading."""

    @classmethod
    def init_tokenizer(cls):
        """Return the pretrained "bert-base-uncased" BertTokenizer."""
        return BertTokenizer.from_pretrained("bert-base-uncased")

    def load_from_pretrained(self, url_or_filename, rename_text_keys=True):
        """
        Load weights from a checkpoint url or file into this model (non-strict).

        Args:
            url_or_filename (str): url or local path of the checkpoint.
            rename_text_keys (bool): whether to strip "bert." from checkpoint keys
                before loading. Default: True.

        Returns:
            The result of load_state_dict(strict=False), which reports the missing keys.

        Raises:
            RuntimeError: if url_or_filename is neither a url nor an existing file.
        """
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        # Checkpoints either wrap the weights under "model" or are the weights.
        state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint

        # Adapt the checkpoint's position embedding to this model's visual encoder.
        state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )

        # Build the model's own state dict once instead of on every lookup.
        own_state = self.state_dict()

        if (
            "visual_encoder_m.pos_embed" in own_state
            and "visual_encoder_m.pos_embed" in state_dict
        ):
            state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed(
                state_dict["visual_encoder_m.pos_embed"], self.visual_encoder_m
            )

        if rename_text_keys:
            for key in list(state_dict.keys()):
                # Test for exactly the substring that gets removed. A key that
                # merely contains "bert" (no dot) would be renamed onto itself
                # and then deleted, silently dropping that weight.
                if "bert." in key:
                    state_dict[key.replace("bert.", "")] = state_dict.pop(key)

        # Drop checkpoint tensors whose shape does not match the model so the
        # non-strict load reports them as missing instead of failing.
        for key, own_param in own_state.items():
            if key in state_dict and state_dict[key].shape != own_param.shape:
                del state_dict[key]

        msg = self.load_state_dict(state_dict, strict=False)

        logging.info("Missing keys {}".format(msg.missing_keys))
        logging.info("load checkpoint from %s" % url_or_filename)
        return msg
text_atts.append(text_input.attention_mask) + + text_embeds = torch.cat(text_embeds, dim=0) + text_ids = torch.cat(text_ids, dim=0) + text_atts = torch.cat(text_atts, dim=0) + if hasattr(model.tokenizer, "enc_token_id"): + text_ids[:, 0] = model.tokenizer.enc_token_id + + image_feats = [] + image_embeds = [] + for samples in data_loader: + image = samples["image"] + + image = image.to(model.device) + image_feat = model.visual_encoder.forward_features(image) + image_embed = model.vision_proj(image_feat[:, 0, :]) + image_embed = F.normalize(image_embed, dim=-1) + + image_feats.append(image_feat.cpu()) + image_embeds.append(image_embed) + + image_feats = torch.cat(image_feats, dim=0) + image_embeds = torch.cat(image_embeds, dim=0) + + sims_matrix = image_embeds @ text_embeds.t() + score_matrix_i2t = torch.full( + (len(data_loader.dataset.image), len(texts)), -100.0 + ).to(model.device) + + num_tasks = dist_utils.get_world_size() + rank = dist_utils.get_rank() + step = sims_matrix.size(0) // num_tasks + 1 + start = rank * step + end = min(sims_matrix.size(0), start + step) + + for i, sims in enumerate( + metric_logger.log_every(sims_matrix[start:end], 50, header) + ): + # topk_sim, topk_idx = sims.topk(k=config["k_test"], dim=0) + topk_sim, topk_idx = sims.topk(k=k_test, dim=0) + + encoder_output = image_feats[start + i].repeat(k_test, 1, 1).to(model.device) + encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to( + model.device + ) + output = model.text_encoder( + text_ids[topk_idx], + attention_mask=text_atts[topk_idx], + encoder_hidden_states=encoder_output, + encoder_attention_mask=encoder_att, + return_dict=True, + ) + score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_i2t[start + i, topk_idx] = score + topk_sim + + sims_matrix = sims_matrix.t() + score_matrix_t2i = torch.full( + (len(texts), len(data_loader.dataset.image)), -100.0 + ).to(model.device) + + step = sims_matrix.size(0) // num_tasks + 1 + start = rank * 
step + end = min(sims_matrix.size(0), start + step) + + for i, sims in enumerate( + metric_logger.log_every(sims_matrix[start:end], 50, header) + ): + + topk_sim, topk_idx = sims.topk(k=k_test, dim=0) + encoder_output = image_feats[topk_idx.cpu()].to(model.device) + encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to( + model.device + ) + output = model.text_encoder( + text_ids[start + i].repeat(k_test, 1), + attention_mask=text_atts[start + i].repeat(k_test, 1), + encoder_hidden_states=encoder_output, + encoder_attention_mask=encoder_att, + return_dict=True, + ) + score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_t2i[start + i, topk_idx] = score + topk_sim + + if dist_utils.is_dist_avail_and_initialized(): + dist.barrier() + torch.distributed.all_reduce( + score_matrix_i2t, op=torch.distributed.ReduceOp.SUM + ) + torch.distributed.all_reduce( + score_matrix_t2i, op=torch.distributed.ReduceOp.SUM + ) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logging.info("Evaluation time {}".format(total_time_str)) + + return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy() diff --git a/lavis/models/albef_models/__pycache__/__init__.cpython-310.pyc b/lavis/models/albef_models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cf2a0f3987d9d4199b55b0388ab78894ba1da58 Binary files /dev/null and b/lavis/models/albef_models/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/albef_models/__pycache__/albef_classification.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_classification.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..320276e7b8b586836aea7dbc1e651099991a567a Binary files /dev/null and b/lavis/models/albef_models/__pycache__/albef_classification.cpython-310.pyc differ diff --git 
a/lavis/models/albef_models/__pycache__/albef_feature_extractor.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_feature_extractor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15a65f0b5cd44c1941102b082ab43f8b0b6a7503 Binary files /dev/null and b/lavis/models/albef_models/__pycache__/albef_feature_extractor.cpython-310.pyc differ diff --git a/lavis/models/albef_models/__pycache__/albef_nlvr.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_nlvr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b23ef75de5e492adf07a082b5ff665ff03009b63 Binary files /dev/null and b/lavis/models/albef_models/__pycache__/albef_nlvr.cpython-310.pyc differ diff --git a/lavis/models/albef_models/__pycache__/albef_outputs.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_outputs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38705fd15a43997b56b1777c35a1f86795a78ce9 Binary files /dev/null and b/lavis/models/albef_models/__pycache__/albef_outputs.cpython-310.pyc differ diff --git a/lavis/models/albef_models/__pycache__/albef_pretrain.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_pretrain.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c7d0d16e1b09d40e317ce0c1425db306abf4108 Binary files /dev/null and b/lavis/models/albef_models/__pycache__/albef_pretrain.cpython-310.pyc differ diff --git a/lavis/models/albef_models/__pycache__/albef_retrieval.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_retrieval.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63d426b0bc6c496c936bef7c5a5b6535e546adf2 Binary files /dev/null and b/lavis/models/albef_models/__pycache__/albef_retrieval.cpython-310.pyc differ diff --git a/lavis/models/albef_models/__pycache__/albef_vqa.cpython-310.pyc b/lavis/models/albef_models/__pycache__/albef_vqa.cpython-310.pyc new file 
@registry.register_model("albef_classification")
class AlbefClassification(AlbefBase, MomentumDistilationMixin):
    """ALBEF image-text classifier with optional momentum distillation."""

    PRETRAINED_MODEL_CONFIG_DICT = {
        "ve": "configs/models/albef_classification_ve.yaml",
    }

    def __init__(
        self,
        image_encoder,
        text_encoder,
        num_classes,
        momentum=0.995,
        alpha=0.4,
        use_distill=True,
        max_txt_len=40,
    ):
        """
        Args:
            image_encoder: visual encoder; must provide forward_features().
            text_encoder: text/multimodal encoder; must provide forward_automask()
                and config.hidden_size.
            num_classes (int): number of output classes; no classifier head is
                built when it is not positive.
            momentum (float): momentum of the distillation copies. Default: 0.995.
            alpha (float): maximum weight of the distillation loss. Default: 0.4.
            use_distill (bool): whether to keep momentum copies. Default: True.
            max_txt_len (int): maximum tokenized text length. Default: 40.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()
        self.max_txt_len = max_txt_len

        self.use_distill = use_distill

        self.visual_encoder = image_encoder
        self.text_encoder = text_encoder

        hidden_size = text_encoder.config.hidden_size

        has_classifier = num_classes > 0
        if has_classifier:
            self.cls_head = nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, num_classes),
            )
        else:
            warnings.warn(
                f"Found num_classes=0, initializing {type(self)} without classifier."
            )

        if self.use_distill:
            self.visual_encoder_m = deepcopy(self.visual_encoder)
            self.text_encoder_m = deepcopy(self.text_encoder)

            self.momentum = momentum
            self.alpha = alpha

            self.model_pairs = [
                [self.visual_encoder, self.visual_encoder_m],
                [self.text_encoder, self.text_encoder_m],
            ]

            # Only mirror the classifier head when one was built; without this
            # guard the "without classifier" path crashed on the missing
            # cls_head attribute as soon as distillation was enabled.
            if has_classifier:
                self.cls_head_m = deepcopy(self.cls_head)
                self.model_pairs.append([self.cls_head, self.cls_head_m])

            self.copy_params()

    def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
        """Return the training progress in epochs, capped at 1."""
        return min(1, (epoch * num_iters_per_epoch + iters) / num_iters_per_epoch)

    def forward(self, samples, is_train=True):
        """
        Classify image-text pairs.

        Args:
            samples (dict): reads "text_input", "image" and "label"; when training
                with distillation also "epoch", "iters" and "num_iters_per_epoch".
                The tokenized text is stored back under "tokenized_text".
            is_train (bool): whether to compute the loss. Default: True.

        Returns:
            An AlbefOutputWithLogits when is_train is True, otherwise a dict with
            the keys "predictions" and "targets".
        """
        sentences = samples["text_input"]
        sentences = self.tokenizer(
            sentences,
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(self.device)
        samples.update({"tokenized_text": sentences})

        targets = samples["label"]

        image_embeds = self.visual_encoder.forward_features(samples["image"])
        encoder_output = self.text_encoder.forward_automask(
            samples["tokenized_text"], image_embeds
        )

        # Classify from the first token of the encoder output.
        prediction = self.cls_head(encoder_output.last_hidden_state[:, 0, :])

        if is_train:
            if self.use_distill:
                with torch.no_grad():
                    self._momentum_update()

                    # NOTE(review): the momentum encoder is called directly while
                    # the online encoder uses forward_features(); presumably the
                    # two are equivalent — confirm against VisionTransformerEncoder.
                    image_embeds_m = self.visual_encoder_m(samples["image"])
                    encoder_output_m = self.text_encoder_m.forward_automask(
                        samples["tokenized_text"], image_embeds_m
                    )

                    prediction_m = self.cls_head_m(
                        encoder_output_m.last_hidden_state[:, 0, :]
                    )

                alpha = self.alpha * self._rampup_factor(
                    epoch=samples["epoch"],
                    iters=samples["iters"],
                    num_iters_per_epoch=samples["num_iters_per_epoch"],
                )

                # Blend the hard-label loss with the soft-label loss against
                # the momentum model's predictions.
                loss = (1 - alpha) * F.cross_entropy(
                    prediction, targets
                ) - alpha * torch.sum(
                    F.log_softmax(prediction, dim=1) * F.softmax(prediction_m, dim=1),
                    dim=1,
                ).mean()
            else:
                loss = F.cross_entropy(prediction, targets)

                image_embeds_m, encoder_output_m, prediction_m = None, None, None

            return AlbefOutputWithLogits(
                loss=loss,
                intermediate_output=AlbefIntermediateOutput(
                    image_embeds=image_embeds,
                    image_embeds_m=image_embeds_m,
                    encoder_output=encoder_output,
                    encoder_output_m=encoder_output_m,
                ),
                logits=prediction,
                logits_m=prediction_m,
            )
        else:
            return {"predictions": prediction, "targets": targets}

    def predict(self, samples):
        """Run forward() in inference mode and return its predictions/targets dict."""
        output = self.forward(samples, is_train=False)
        return output

    @classmethod
    def from_config(cls, cfg=None):
        """
        Build the model from a config and load the checkpoint it references.

        Reads "alpha", "momentum", "use_distill", "num_classes" and "max_txt_len"
        from cfg; "num_classes" must be greater than 1.
        """
        image_encoder = VisionTransformerEncoder.from_config(cfg)

        # text encoder + multimodal encoder
        text_encoder = XBertEncoder.from_config(cfg)

        alpha = cfg.get("alpha", 0.4)
        momentum = cfg.get("momentum", 0.995)
        use_distill = cfg.get("use_distill", True)
        num_classes = cfg.get("num_classes", -1)
        max_txt_len = cfg.get("max_txt_len", 40)

        assert num_classes > 1, "Invalid number of classes provided, found {}".format(
            num_classes
        )

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            use_distill=use_distill,
            alpha=alpha,
            num_classes=num_classes,
            momentum=momentum,
            max_txt_len=max_txt_len,
        )

        model.load_checkpoint_from_config(cfg)

        return model
@registry.register_model("albef_feature_extractor")
class AlbefFeatureExtractor(AlbefBase):
    """
    ALBEF model wrapper that exposes unimodal and multimodal features.

    The wrapper owns a vision encoder, a text encoder, and two linear
    projection heads that map encoder outputs into a shared ``embed_dim``
    space. See ``extract_features`` for the inference entry point.
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "base": "configs/models/albef_feature_extractor.yaml",
    }

    def __init__(self, image_encoder, text_encoder, embed_dim=256, max_txt_len=30):
        """
        Args:
            image_encoder: vision encoder exposing ``vision_width`` and
                ``forward_features``.
            text_encoder: text encoder exposing ``config.hidden_size`` and ``bert``.
            embed_dim (int): output size of the projection heads.
            max_txt_len (int): stored on the instance; ``extract_features``
                does not truncate captions to this length.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        self.visual_encoder = image_encoder
        self.text_encoder = text_encoder

        text_width = text_encoder.config.hidden_size
        vision_width = image_encoder.vision_width

        self.embed_dim = embed_dim

        self.vision_proj = nn.Linear(vision_width, embed_dim)
        self.text_proj = nn.Linear(text_width, embed_dim)

        self.max_txt_len = max_txt_len

        self.temp = nn.Parameter(0.07 * torch.ones([]))

    @torch.no_grad()
    def extract_features(self, samples, mode="multimodal"):
        """
        Extract features for multimodal or unimodal samples.

        Args:
            samples (dict): A dictionary of samples, containing the following keys:
                - image (torch.Tensor): A tensor of shape (B, C, H, W) containing the image.
                    Raw images should be preprocessed before being passed to feature extractor.
                    Only required when mode includes "image" or "multimodal".
                - text_input (list): A list of strings containing the text, length B.
                    Only required when mode includes "text" or "multimodal".
            mode (str or list of str): The mode(s) of feature extraction. Each entry must be
                one of "multimodal", "text" or "image".
                If "multimodal", image, text and multimodal features are computed;
                if "text", only text features are computed;
                if "image", only image features are computed.
                Default: "multimodal".

        Returns:
            An AlbefOutputFeatures object, see lavis/models/albef_models/albef_outputs.py for details.
            Fields that were not computed for the requested mode are left as None.
            ``image_embeds_proj`` / ``text_embeds_proj`` are the L2-normalized linear
            projections of ``image_embeds`` / ``text_embeds`` (applied to every token).

        Raises:
            AssertionError: if ``mode`` contains an unknown value, or if the input
                required by the requested mode is missing from ``samples``.

        Examples:
        ```python
            >>> from PIL import Image
            >>> from lavis.models import load_model_and_preprocess
            >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB")
            >>> caption = "a large fountain spewing water into the air"
            >>> model, vis_processors, txt_processors = load_model_and_preprocess("albef_feature_extractor", is_eval=True)
            >>> image = vis_processors["eval"](raw_image).unsqueeze(0)
            >>> text_input = txt_processors["eval"](caption)

            >>> sample = {"image": image, "text_input": [text_input]}

            >>> features_multimodal = model.extract_features(sample)
            >>> features_multimodal.image_embeds.shape
            torch.Size([1, 197, 768])
            >>> features_multimodal.multimodal_embeds.shape
            torch.Size([1, 12, 768])

            >>> features_text = model.extract_features(sample, mode="text")
            >>> features_text.keys()
            odict_keys(['text_embeds', 'text_embeds_proj'])
            >>> features_text.text_embeds.shape
            torch.Size([1, 12, 768])
            >>> features_text.text_embeds_proj.shape
            torch.Size([1, 12, 256])

            >>> features_image = model.extract_features(sample, mode="image")
            >>> features_image.keys()
            odict_keys(['image_embeds', 'image_embeds_proj'])
            >>> features_image.image_embeds.shape
            torch.Size([1, 197, 768])
            >>> features_image.image_embeds_proj.shape
            torch.Size([1, 197, 256])
        ```
        """
        # Use .get so that unimodal extraction does not require the other
        # modality to be present; the asserts below report what is missing.
        image = samples.get("image")
        caption = samples.get("text_input")

        if isinstance(mode, str):
            mode = [mode]

        for m in mode:
            assert m in [
                "multimodal",
                "image",
                "text",
            ], "mode must be one of [multimodal, image, text], but got {}".format(m)

        # initialize output
        image_embeds, text_embeds, multimodal_embeds = None, None, None
        image_features, text_features = None, None

        if "image" in mode or "multimodal" in mode:
            assert (
                image is not None
            ), "image must be provided if mode is 'image' or 'multimodal'"

            image_embeds = self.visual_encoder.forward_features(image)
            image_features = F.normalize(self.vision_proj(image_embeds), dim=-1)

        if "text" in mode or "multimodal" in mode:
            assert (
                caption is not None
            ), "text must be provided if mode is 'text' or 'multimodal'"

            text = self.tokenizer(
                caption,
                padding=True,
                return_tensors="pt",
            ).to(self.device)

            text_output = self.text_encoder.bert(
                text.input_ids,
                attention_mask=text.attention_mask,
                return_dict=True,
                mode="text",
            )
            text_embeds = text_output.last_hidden_state
            text_features = F.normalize(self.text_proj(text_embeds), dim=-1)

        if "multimodal" in mode:
            # Every image token is attended to: the mask is all ones.
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
                self.device
            )

            # forward the positive image-text pair
            output = self.text_encoder.bert(
                encoder_embeds=text_embeds,
                attention_mask=text.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
                mode="fusion",
            )

            multimodal_embeds = output.last_hidden_state

        return AlbefOutputFeatures(
            image_embeds=image_embeds,
            image_embeds_proj=image_features,
            text_embeds=text_embeds,
            text_embeds_proj=text_features,
            multimodal_embeds=multimodal_embeds,
        )

    @classmethod
    def from_config(cls, cfg=None):
        """
        Build the feature extractor from a config node.

        Reads ``med_config_path`` (required), ``embed_dim``, ``max_txt_len`` and
        ``pretrained`` from ``cfg``. When ``pretrained`` is absent, a warning is
        emitted and no ALBEF checkpoint is loaded.
        """
        image_encoder = VisionTransformerEncoder.from_config(cfg, from_pretrained=True)
        config_text_encoder = BertConfig.from_json_file(
            get_abs_path(cfg["med_config_path"])
        )
        config_text_encoder.fusion_layer = 6
        text_encoder = BertForMaskedLM.from_pretrained(
            "bert-base-uncased", config=config_text_encoder
        )

        embed_dim = cfg.get("embed_dim", 256)
        max_txt_len = cfg.get("max_txt_len", 30)

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            embed_dim=embed_dim,
            max_txt_len=max_txt_len,
        )

        # load pre-trained weights
        pretrain_path = cfg.get("pretrained", None)
        if pretrain_path is not None:
            model.load_from_pretrained(
                url_or_filename=pretrain_path, rename_text_keys=False
            )
        else:
            warnings.warn("No pretrained weights are loaded.")

        return model
def forward(self, samples, is_train=True):
    """
    Forward function for training and evaluation.

    Args:
        samples (dict): a dict of input samples, which contains the following keys:
            - image0 (torch.Tensor): input image 0, shape (batch_size, 3, H, W), default H=384, W=384.
            - image1 (torch.Tensor): input image 1, shape (batch_size, 3, H, W), default H=384, W=384.
            - text_input (list): list of strings, each string is a natural language sentence.
            - label (torch.LongTensor): ground truth label with shape (batch_size,).
            - epoch (int), iters (int), num_iters_per_epoch (int): only read when
              training with ``use_distill=True``; they drive the ramp-up of the
              distillation weight.
        is_train (bool): whether the model is in training mode.
            If True, the model will return the loss;
            If False, the model will return the prediction.

    Returns:
        When ``is_train`` is True, an ``AlbefOutput`` holding ``loss`` and
        ``intermediate_output``. The momentum entries of the intermediate output
        are ``None`` when ``use_distill`` is False.
        When ``is_train`` is False, a dict with keys ``"predictions"`` and ``"targets"``.

    Examples:
        >>> import torch
        >>> from lavis.models import load_model
        >>> model = load_model("albef_nlvr")
        >>> samples = {
        ...     "image0": torch.randn(2, 3, 384, 384),
        ...     "image1": torch.randn(2, 3, 384, 384),
        ...     "text_input": ["there is a ferret in tall grass", "there are lips in one of the images"],
        ...     "label": torch.tensor([0, 1]),
        ...     "epoch": 0,
        ...     "iters": 0,
        ...     "num_iters_per_epoch": 100,
        ... }
        >>> output = model(samples)
        >>> output.keys()
        odict_keys(['intermediate_output', 'loss'])
    """
    text = samples["text_input"]
    text = self.tokenizer(
        text,
        padding="longest",
        truncation=True,
        max_length=self.max_txt_len,
        return_tensors="pt",
    ).to(self.device)

    targets = samples["label"]

    image0 = samples["image0"]
    image1 = samples["image1"]
    # Both images go through the vision encoder in one batch and are split afterwards.
    images = torch.cat([image0, image1], dim=0)

    image_embeds = self.visual_encoder.forward_features(images)
    image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
        self.device
    )
    image0_embeds, image1_embeds = torch.split(image_embeds, targets.size(0))

    encoder_output = self.text_encoder(
        text.input_ids,
        attention_mask=text.attention_mask,
        encoder_hidden_states=[image0_embeds, image1_embeds],
        encoder_attention_mask=[
            image_atts[: image0_embeds.size(0)],
            image_atts[image0_embeds.size(0) :],
        ],
        return_dict=True,
    )

    # Classify from the first ([CLS]) position of the fused sequence.
    prediction = self.cls_head(encoder_output.last_hidden_state[:, 0, :])

    if is_train:
        if self.use_distill:
            with torch.no_grad():
                self._momentum_update()

                image_embeds_m = self.visual_encoder_m(images)
                image0_embeds_m, image1_embeds_m = torch.split(
                    image_embeds_m, targets.size(0)
                )
                # The teacher prediction must come from the momentum text
                # encoder, not the online one.
                encoder_output_m = self.text_encoder_m(
                    text.input_ids,
                    attention_mask=text.attention_mask,
                    encoder_hidden_states=[image0_embeds_m, image1_embeds_m],
                    encoder_attention_mask=[
                        image_atts[: image0_embeds_m.size(0)],
                        image_atts[image0_embeds_m.size(0) :],
                    ],
                    return_dict=True,
                )

                prediction_m = self.cls_head_m(
                    encoder_output_m.last_hidden_state[:, 0, :]
                )

            alpha = self.alpha * self._rampup_factor(
                epoch=samples["epoch"],
                iters=samples["iters"],
                num_iters_per_epoch=samples["num_iters_per_epoch"],
            )

            # Blend hard-label cross entropy with soft-label distillation.
            loss = (1 - alpha) * F.cross_entropy(
                prediction, targets
            ) - alpha * torch.sum(
                F.log_softmax(prediction, dim=1) * F.softmax(prediction_m, dim=1),
                dim=1,
            ).mean()

            stacked_image_embeds_m = torch.stack(
                [image0_embeds_m, image1_embeds_m], dim=0
            )
        else:
            loss = F.cross_entropy(prediction, targets)

            # No momentum branch: torch.stack cannot take None entries, so
            # report the momentum outputs as missing instead.
            encoder_output_m = None
            stacked_image_embeds_m = None

        # return {"loss": loss}
        return AlbefOutput(
            loss=loss,
            intermediate_output=AlbefIntermediateOutput(
                image_embeds=torch.stack([image0_embeds, image1_embeds], dim=0),
                image_embeds_m=stacked_image_embeds_m,
                encoder_output=encoder_output,
                encoder_output_m=encoder_output_m,
            ),
        )
    else:
        return {"predictions": prediction, "targets": targets}
"Invalid number of classes provided, found {}".format( + num_classes + ) + + model = cls( + image_encoder=image_encoder, + text_encoder=text_encoder, + use_distill=use_distill, + alpha=alpha, + num_classes=num_classes, + momentum=momentum, + max_txt_len=max_txt_len, + ) + + model.load_checkpoint_from_config(cfg) + + return model diff --git a/lavis/models/albef_models/albef_outputs.py b/lavis/models/albef_models/albef_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..a3f73f39cf175319aa095cb24f30e9496f305a74 --- /dev/null +++ b/lavis/models/albef_models/albef_outputs.py @@ -0,0 +1,97 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from dataclasses import dataclass +from typing import Optional + +import torch +from transformers.modeling_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + ModelOutput, +) + + +@dataclass +class AlbefSimilarity(ModelOutput): + sim_i2t: torch.FloatTensor = None + sim_t2i: torch.FloatTensor = None + + sim_i2t_m: Optional[torch.FloatTensor] = None + sim_t2i_m: Optional[torch.FloatTensor] = None + + sim_i2t_targets: Optional[torch.FloatTensor] = None + sim_t2i_targets: Optional[torch.FloatTensor] = None + + +@dataclass +class AlbefIntermediateOutput(ModelOutput): + # uni-modal features + image_embeds: torch.FloatTensor = None + text_embeds: Optional[torch.FloatTensor] = None + + image_embeds_m: Optional[torch.FloatTensor] = None + text_embeds_m: Optional[torch.FloatTensor] = None + + # intermediate outputs of multimodal encoder + encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None + encoder_output_m: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None + encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None + + 
@dataclass
class AlbefOutputFeatures(ModelOutput):
    """
    Data class of features from AlbefFeatureExtractor.

    Args:
        image_embeds: `torch.FloatTensor` of shape `(batch_size, num_patches+1, embed_dim)`, `optional`
        image_embeds_proj: `torch.FloatTensor` of shape `(batch_size, num_patches+1, feature_dim)`, `optional`
        text_embeds: `torch.FloatTensor` of shape `(batch_size, sequence_length+1, embed_dim)`, `optional`
        text_embeds_proj: `torch.FloatTensor` of shape `(batch_size, sequence_length+1, feature_dim)`, `optional`
        multimodal_embeds: `torch.FloatTensor`, `optional`. Last hidden state of the fusion
            encoder; presumably the same shape as `text_embeds` (TODO confirm).

    The first embedding or feature is for the [CLS] token.

    The `*_proj` features are obtained by projecting the corresponding embedding into a normalized low-dimensional space.
    Every field defaults to None.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    image_embeds_proj: Optional[torch.FloatTensor] = None

    text_embeds: Optional[torch.FloatTensor] = None
    text_embeds_proj: Optional[torch.FloatTensor] = None

    multimodal_embeds: Optional[torch.FloatTensor] = None
+ """ + + PRETRAINED_MODEL_CONFIG_DICT = { + "base": "configs/models/albef_pretrain_base.yaml", + } + + def __init__( + self, + image_encoder, + text_encoder, + queue_size, + embed_dim=256, + mlm_mask_prob=0.15, + temp=0.07, + momentum=0.995, + alpha=0.4, + max_txt_len=30, + ): + super().__init__() + + self.tokenizer = self.init_tokenizer() + + self.visual_encoder = image_encoder + self.text_encoder = text_encoder + + text_width = text_encoder.config.hidden_size + vision_width = image_encoder.vision_width + + self.embed_dim = embed_dim + + self.vision_proj = nn.Linear(vision_width, embed_dim) + self.text_proj = nn.Linear(text_width, embed_dim) + + self.itm_head = nn.Linear(text_width, 2) + + # create the momentum encoder + self.visual_encoder_m = deepcopy(self.visual_encoder) + self.text_encoder_m = deepcopy(self.text_encoder) + + self.vision_proj_m = deepcopy(self.vision_proj) + self.text_proj_m = deepcopy(self.text_proj) + + self.model_pairs = [ + [self.visual_encoder, self.visual_encoder_m], + [self.text_encoder, self.text_encoder_m], + [self.vision_proj, self.vision_proj_m], + [self.text_proj, self.text_proj_m], + ] + self.copy_params() + + # create the queue + self.register_buffer("image_queue", torch.randn(embed_dim, queue_size)) + self.register_buffer("text_queue", torch.randn(embed_dim, queue_size)) + self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long)) + + self.image_queue = nn.functional.normalize(self.image_queue, dim=0) + self.text_queue = nn.functional.normalize(self.text_queue, dim=0) + + self.queue_size = queue_size + self.momentum = momentum + self.temp = nn.Parameter(temp * torch.ones([])) + + self.alpha = alpha + self.max_txt_len = max_txt_len + + self.mlm_probability = mlm_mask_prob + + def _rampup_factor(self, epoch, iters, num_iters_per_epoch): + return min(1, (epoch * num_iters_per_epoch + iters) / (2 * num_iters_per_epoch)) + + def forward(self, samples): + """ + Args: + samples (dict): A dictionary containing the following 
keys: + - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). The input images. Default: H=224, W=224. + - text_input (list): A list of length batch_size, each element is a string of text/caption. + - epoch (int): The current epoch. + - iters (int): The current iteration. + - num_iters_per_epoch (int): The number of iterations per epoch. + + Returns: + BlipOutput: A BlipOutput object containing loss and intermediate output. See ``lavis.models.blip_models.blip_outputs.BlipOutput`` for more details. + + Examples: + >>> import torch + >>> from lavis.models import load_model + >>> model = load_model("albef_pretrain") + >>> images = torch.randn(4, 3, 224, 224) + >>> text_input = ["caption of image 1", "another caption of image 1", "caption of image 2", "caption of image 3"] + >>> samples = {"image": images, "text_input": text_input, "epoch": 0, "iters": 0, "num_iters_per_epoch": 100} + >>> output = model(samples) + >>> output.keys() + odict_keys(['sims', 'intermediate_output', 'loss', 'loss_itc', 'loss_itm', 'loss_mlm']) + """ + image = samples["image"] + caption = samples["text_input"] + + alpha = self.alpha * self._rampup_factor( + epoch=samples["epoch"], + iters=samples["iters"], + num_iters_per_epoch=samples["num_iters_per_epoch"], + ) + + with torch.no_grad(): + self.temp.clamp_(0.001, 0.5) + + image_embeds = self.visual_encoder.forward_features(image) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( + self.device + ) + + text = self.tokenizer( + caption, + padding="max_length", + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1) + + text_output = self.text_encoder.bert( + text.input_ids, + attention_mask=text.attention_mask, + return_dict=True, + mode="text", + ) + text_embeds = text_output.last_hidden_state + text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1) + + # get momentum features + 
with torch.no_grad(): + self._momentum_update() + image_embeds_m = self.visual_encoder_m(image) + image_feat_m = F.normalize( + self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1 + ) + image_feat_all = torch.cat( + [image_feat_m.t(), self.image_queue.clone().detach()], dim=1 + ) + text_output_m = self.text_encoder_m.bert( + text.input_ids, + attention_mask=text.attention_mask, + return_dict=True, + mode="text", + ) + text_embeds_m = text_output_m.last_hidden_state + text_feat_m = F.normalize(self.text_proj_m(text_embeds_m[:, 0, :]), dim=-1) + text_feat_all = torch.cat( + [text_feat_m.t(), self.text_queue.clone().detach()], dim=1 + ) + + sim_i2t_m = image_feat_m @ text_feat_all / self.temp + sim_t2i_m = text_feat_m @ image_feat_all / self.temp + + sim_targets = torch.zeros(sim_i2t_m.size()).to(image.device) + sim_targets.fill_diagonal_(1) + + sim_i2t_targets = ( + alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets + ) + sim_t2i_targets = ( + alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets + ) + + sim_i2t = image_feat @ text_feat_all / self.temp + sim_t2i = text_feat @ image_feat_all / self.temp + + loss_i2t = -torch.sum( + F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets, dim=1 + ).mean() + loss_t2i = -torch.sum( + F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets, dim=1 + ).mean() + + loss_itc = (loss_i2t + loss_t2i) / 2 + + self._dequeue_and_enqueue(image_feat_m, text_feat_m) + + # forward the positve image-text pair + encoder_output_pos = self.text_encoder.bert( + encoder_embeds=text_embeds, + attention_mask=text.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + mode="fusion", + ) + with torch.no_grad(): + bs = image.size(0) + + weights_i2t = sim_i2t[:, :bs].clone() + weights_t2i = sim_t2i[:, :bs].clone() + + weights_i2t.fill_diagonal_(-np.Inf) + weights_t2i.fill_diagonal_(-np.Inf) + + weights_i2t = F.softmax(weights_i2t, dim=1) + weights_t2i = F.softmax(weights_t2i, dim=1) 
+ + # select a negative image for each text + image_embeds_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_t2i[b], 1).item() + image_embeds_neg.append(image_embeds[neg_idx]) + image_embeds_neg = torch.stack(image_embeds_neg, dim=0) + + # select a negative text for each image + text_embeds_neg = [] + text_atts_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_i2t[b], 1).item() + text_embeds_neg.append(text_embeds[neg_idx]) + text_atts_neg.append(text.attention_mask[neg_idx]) + text_embeds_neg = torch.stack(text_embeds_neg, dim=0) + text_atts_neg = torch.stack(text_atts_neg, dim=0) + + text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0) + text_atts_all = torch.cat([text.attention_mask, text_atts_neg], dim=0) + + image_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0) + image_atts_all = torch.cat([image_atts, image_atts], dim=0) + + encoder_output_neg = self.text_encoder.bert( + encoder_embeds=text_embeds_all, + attention_mask=text_atts_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + return_dict=True, + mode="fusion", + ) + + vl_embeddings = torch.cat( + [ + encoder_output_pos.last_hidden_state[:, 0, :], + encoder_output_neg.last_hidden_state[:, 0, :], + ], + dim=0, + ) + itm_logits = self.itm_head(vl_embeddings) + + itm_labels = torch.cat( + [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)], + dim=0, + ).to(self.device) + loss_itm = F.cross_entropy(itm_logits, itm_labels) + + # MLM + input_ids = text.input_ids.clone() + labels = input_ids.clone() + + probability_matrix = torch.full(labels.shape, self.mlm_probability) + input_ids, labels = self.mask( + input_ids, + self.text_encoder.config.vocab_size, + self.device, + targets=labels, + probability_matrix=probability_matrix, + ) + + with torch.no_grad(): + logits_m = self.text_encoder_m( + input_ids, + attention_mask=text.attention_mask, + encoder_hidden_states=image_embeds_m, + 
def mask(
    self,
    input_ids,
    vocab_size,
    device,
    targets=None,
    masked_indices=None,
    probability_matrix=None,
):
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

    ``input_ids`` and ``targets`` are modified in place.

    Args:
        input_ids (torch.LongTensor): token ids to corrupt.
        vocab_size (int): exclusive upper bound for the random replacement ids.
        device: kept for backward compatibility; all helper tensors are placed
            on ``input_ids.device`` so that boolean indexing never mixes devices.
        targets (torch.LongTensor, optional): labels; positions that are not
            selected for masking are set to -100.
        masked_indices (torch.BoolTensor, optional): positions to mask. When
            omitted, they are sampled from ``probability_matrix``.
        probability_matrix (torch.FloatTensor, optional): per-position masking
            probability, required when ``masked_indices`` is omitted.

    Returns:
        ``(input_ids, targets)`` when ``targets`` is given, otherwise ``input_ids``.

    Raises:
        ValueError: if neither ``masked_indices`` nor ``probability_matrix`` is given.
    """
    if masked_indices is None:
        if probability_matrix is None:
            raise ValueError(
                "either masked_indices or probability_matrix must be provided"
            )
        masked_indices = torch.bernoulli(probability_matrix).bool()

    # The masks below are combined with comparisons on input_ids, so they
    # must live on the same device as input_ids.
    ids_device = input_ids.device
    masked_indices = masked_indices.to(ids_device)

    # Never mask padding or [CLS] positions.
    masked_indices[input_ids == self.tokenizer.pad_token_id] = False
    masked_indices[input_ids == self.tokenizer.cls_token_id] = False

    if targets is not None:
        # We only compute loss on masked tokens
        targets[(~masked_indices).to(targets.device)] = -100

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = (
        torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool().to(ids_device)
        & masked_indices
    )
    input_ids[indices_replaced] = self.tokenizer.mask_token_id

    # 10% of the time, we replace masked input tokens with random word
    # (half of the 20% that was not replaced by [MASK])
    indices_random = (
        torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool().to(ids_device)
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(vocab_size, input_ids.shape, dtype=torch.long).to(
        ids_device
    )
    input_ids[indices_random] = random_words[indices_random]
    # The rest of the time (10% of the time) we keep the masked input tokens unchanged

    if targets is not None:
        return input_ids, targets
    else:
        return input_ids
from copy import deepcopy

import torch
import torch.nn.functional as F
from lavis.common.registry import registry
from lavis.models.albef_models import AlbefBase, compute_sim_matrix
from lavis.models.albef_models.albef_outputs import (
    AlbefIntermediateOutput,
    AlbefOutput,
    AlbefSimilarity,
)
from lavis.models.base_model import MomentumDistilationMixin, SharedQueueMixin
from lavis.models.med import XBertEncoder
from lavis.models.vit import VisionTransformerEncoder
from torch import nn


@registry.register_model("albef_retrieval")
class AlbefRetrieval(AlbefBase, MomentumDistilationMixin, SharedQueueMixin):
    """
    ALBEF retrieval model.

    Supported model types:
        - coco: fine-tuned ALBEF base model on COCO dataset (Karpathy split).
        - flickr: fine-tuned ALBEF base model on Flickr30k dataset.

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("albef_retrieval", "coco")
        >>> model = load_model("albef_retrieval", "flickr")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "coco": "configs/models/albef_retrieval_coco.yaml",
        "flickr": "configs/models/albef_retrieval_flickr.yaml",
    }

    def __init__(
        self,
        image_encoder,
        text_encoder,
        queue_size,
        embed_dim=256,
        temp=0.07,
        use_distill=True,
        momentum=0.995,
        alpha=0.4,
        max_txt_len=30,
    ):
        """
        Args:
            image_encoder: Vision encoder; must expose ``vision_width`` and ``forward_features``.
            text_encoder: Text/fusion encoder; must expose ``config.hidden_size`` and ``forward_text``.
            queue_size (int): Number of momentum features kept in each queue.
            embed_dim (int): Dimension of the shared image-text embedding space.
            temp (float): Initial value of the learnable softmax temperature.
            use_distill (bool): Whether the momentum model's soft targets are mixed into the ITC targets.
            momentum (float): Momentum coefficient stored for the momentum-encoder update.
            alpha (float): Maximum weight of the distillation targets (ramped up during training).
            max_txt_len (int): Captions are padded/truncated to this many tokens.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        self.visual_encoder = image_encoder
        self.text_encoder = text_encoder

        text_width = text_encoder.config.hidden_size
        vision_width = image_encoder.vision_width

        self.vision_proj = nn.Linear(vision_width, embed_dim)
        self.text_proj = nn.Linear(text_width, embed_dim)

        self.itm_head = nn.Linear(text_width, 2)

        # create the momentum encoder
        self.visual_encoder_m = deepcopy(self.visual_encoder)
        self.text_encoder_m = deepcopy(self.text_encoder)

        self.vision_proj_m = deepcopy(self.vision_proj)
        self.text_proj_m = deepcopy(self.text_proj)

        self.model_pairs = [
            [self.visual_encoder, self.visual_encoder_m],
            [self.text_encoder, self.text_encoder_m],
            [self.vision_proj, self.vision_proj_m],
            [self.text_proj, self.text_proj_m],
        ]
        self.copy_params()

        # create the queue
        self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
        self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
        # -100 marks queue slots that do not hold a real image id yet
        self.register_buffer("idx_queue", torch.full((1, queue_size), -100))
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
        self.text_queue = nn.functional.normalize(self.text_queue, dim=0)

        self.queue_size = queue_size
        self.momentum = momentum
        self.temp = nn.Parameter(temp * torch.ones([]))

        self.alpha = alpha
        self.max_txt_len = max_txt_len
        self.use_distill = use_distill

    def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
        """Return a factor that grows linearly from 0 to 1 over the first two epochs."""
        return min(1, (epoch * num_iters_per_epoch + iters) / (2 * num_iters_per_epoch))

    def forward(self, samples):
        """
        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). The input images.
                - text_input (list): A list of length batch_size, each element is a string of text/caption.
                - image_id (torch.Tensor): A tensor of shape (batch_size, ). The image ids, used to identify same images in batch.
                - epoch (int): The current epoch.
                - iters (int): The current iteration.
                - num_iters_per_epoch (int): The number of iterations per epoch.

        Returns:
            AlbefOutput: An AlbefOutput object. See ``lavis.models.albef_models.albef_outputs.AlbefOutput`` for more details.
            When ``use_distill`` is False, ``sims.sim_i2t_m`` / ``sims.sim_t2i_m`` are None and the
            reported targets are the hard (id-based) targets.

        Examples:
            >>> import torch
            >>> from lavis.models import load_model
            >>> model = load_model("albef_retrieval", "coco")
            >>> images = torch.randn(4, 3, 384, 384)
            >>> text_input = ["caption of image 1", "another caption of image 1", "caption of image 2", "caption of image 3"]
            >>> image_id = torch.tensor([1, 1, 2, 3])
            >>> samples = {"image": images, "text_input": text_input, "image_id": image_id, "epoch": 0, "iters": 0, "num_iters_per_epoch": 100}
            >>> output = model(samples)
            >>> output.keys()
            odict_keys(['sims', 'intermediate_output', 'loss', 'loss_itc', 'loss_itm'])
        """
        image = samples["image"]
        caption = samples["text_input"]
        idx = samples["image_id"]

        alpha = self.alpha * self._rampup_factor(
            epoch=samples["epoch"],
            iters=samples["iters"],
            num_iters_per_epoch=samples["num_iters_per_epoch"],
        )

        # keep the learnable temperature in a numerically safe range
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_embeds = self.visual_encoder.forward_features(image)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            self.device
        )

        image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)

        text = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(self.device)

        text_output = self.text_encoder.forward_text(text)

        text_embeds = text_output.last_hidden_state
        text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1)

        # hard targets: every batch/queue entry sharing the image id is a positive
        idx = idx.view(-1, 1)
        idx_all = torch.cat([idx.t(), self.idx_queue.clone().detach()], dim=1)
        pos_idx = torch.eq(idx, idx_all).float()
        sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)

        with torch.no_grad():
            self._momentum_update()
            image_embeds_m = self.visual_encoder_m(image)
            image_feat_m = F.normalize(
                self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1
            )
            image_feat_all = torch.cat(
                [image_feat_m.t(), self.image_queue.clone().detach()], dim=1
            )
            text_output_m = self.text_encoder_m.forward_text(text)
            text_embeds_m = text_output_m.last_hidden_state
            text_feat_m = F.normalize(self.text_proj_m(text_embeds_m[:, 0, :]), dim=-1)
            text_feat_all = torch.cat(
                [text_feat_m.t(), self.text_queue.clone().detach()], dim=1
            )

            if self.use_distill:
                sim_i2t_m = image_feat_m @ text_feat_all / self.temp
                sim_t2i_m = text_feat_m @ image_feat_all / self.temp

                sim_i2t_targets = (
                    alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
                )
                sim_t2i_targets = (
                    alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
                )
            else:
                # Without distillation there are no momentum similarities; bind the
                # names anyway so the output construction below cannot hit a NameError.
                sim_i2t_m = None
                sim_t2i_m = None
                sim_i2t_targets = sim_targets
                sim_t2i_targets = sim_targets

        sim_i2t = image_feat @ text_feat_all / self.temp
        sim_t2i = text_feat @ image_feat_all / self.temp

        # the targets are the soft mixture when distilling, the hard targets otherwise
        loss_i2t = -torch.sum(
            F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets, dim=1
        ).mean()
        loss_t2i = -torch.sum(
            F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets, dim=1
        ).mean()

        loss_itc = (loss_i2t + loss_t2i) / 2

        self._dequeue_and_enqueue(image_feat_m, text_feat_m, idx)

        # ITM: fuse the matched (positive) image-text pairs
        encoder_output_pos = self.text_encoder(
            encoder_embeds=text_embeds,
            attention_mask=text.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
            mode="fusion",
        )

        with torch.no_grad():
            bs = image.size(0)
            weights_i2t = F.softmax(sim_i2t[:, :bs] + 1e-4, dim=1)
            weights_t2i = F.softmax(sim_t2i[:, :bs] + 1e-4, dim=1)

            # never sample an entry with the same image id as a negative
            mask = torch.eq(idx, idx.T)
            weights_i2t.masked_fill_(mask, 0)
            weights_t2i.masked_fill_(mask, 0)

        # select a negative image for each text
        image_embeds_neg = []
        for b in range(bs):
            neg_idx = torch.multinomial(weights_t2i[b], 1).item()
            image_embeds_neg.append(image_embeds[neg_idx])
        image_embeds_neg = torch.stack(image_embeds_neg, dim=0)

        # select a negative text for each image
        text_embeds_neg = []
        text_atts_neg = []
        for b in range(bs):
            neg_idx = torch.multinomial(weights_i2t[b], 1).item()
            text_embeds_neg.append(text_embeds[neg_idx])
            text_atts_neg.append(text.attention_mask[neg_idx])
        text_embeds_neg = torch.stack(text_embeds_neg, dim=0)
        text_atts_neg = torch.stack(text_atts_neg, dim=0)

        # first half: (text, negative image); second half: (negative text, image)
        text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0)
        text_atts_all = torch.cat([text.attention_mask, text_atts_neg], dim=0)

        image_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0)
        image_atts_all = torch.cat([image_atts, image_atts], dim=0)

        encoder_output_neg = self.text_encoder(
            encoder_embeds=text_embeds_all,
            attention_mask=text_atts_all,
            encoder_hidden_states=image_embeds_all,
            encoder_attention_mask=image_atts_all,
            return_dict=True,
            mode="fusion",
        )

        vl_embeddings = torch.cat(
            [
                encoder_output_pos.last_hidden_state[:, 0, :],
                encoder_output_neg.last_hidden_state[:, 0, :],
            ],
            dim=0,
        )
        itm_logits = self.itm_head(vl_embeddings)

        # bs positives followed by 2 * bs negatives
        itm_labels = torch.cat(
            [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)],
            dim=0,
        ).to(self.device)
        loss_itm = F.cross_entropy(itm_logits, itm_labels)

        return AlbefOutput(
            loss=loss_itc + loss_itm,
            loss_itc=loss_itc,
            loss_itm=loss_itm,
            sims=AlbefSimilarity(
                sim_i2t=sim_i2t,
                sim_t2i=sim_t2i,
                sim_i2t_m=sim_i2t_m,
                sim_t2i_m=sim_t2i_m,
                sim_i2t_targets=sim_i2t_targets,
                sim_t2i_targets=sim_t2i_targets,
            ),
            intermediate_output=AlbefIntermediateOutput(
                image_embeds=image_embeds,
                image_embeds_m=image_embeds_m,
                text_embeds=text_embeds,
                text_embeds_m=text_embeds_m,
                encoder_output=encoder_output_pos,
                encoder_output_neg=encoder_output_neg,
                itm_logits=itm_logits,
                itm_labels=itm_labels,
            ),
        )

    @classmethod
    def from_config(cls, cfg=None):
        """Build the model from a config node and load the checkpoint it points to."""
        image_encoder = VisionTransformerEncoder.from_config(cfg, from_pretrained=False)
        text_encoder = XBertEncoder.from_config(cfg)

        embed_dim = cfg.get("embed_dim", 256)
        momentum = cfg.get("momentum", 0.995)
        alpha = cfg.get("alpha", 0.4)
        temp = cfg.get("temp", 0.07)
        max_txt_len = cfg.get("max_txt_len", 30)
        queue_size = cfg.get("queue_size", 0)
        use_distill = cfg.get("use_distill", True)

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            queue_size=queue_size,
            embed_dim=embed_dim,
            temp=temp,
            momentum=momentum,
            alpha=alpha,
            max_txt_len=max_txt_len,
            use_distill=use_distill,
        )

        model.load_checkpoint_from_config(cfg)

        return model

    def compute_sim_matrix(self, data_loader, task_cfg):
        """
        Compute similarity i2t, t2i matrix for the given data loader.
        """
        k_test = task_cfg.k_test

        return compute_sim_matrix(model=self, data_loader=data_loader, k_test=k_test)
import logging
import os
from copy import deepcopy

import torch
import torch.nn.functional as F
from lavis.common.registry import registry
from lavis.common.utils import get_abs_path, is_url
from lavis.models.albef_models import AlbefBase
from lavis.models.albef_models.albef_outputs import AlbefIntermediateOutput, AlbefOutput
from lavis.models.base_model import MomentumDistilationMixin, tile
from lavis.models.med import BertConfig, BertLMHeadModel, XBertEncoder
from lavis.models.vit import VisionTransformerEncoder, interpolate_pos_embed
from lavis.common.dist_utils import download_cached_file


@registry.register_model("albef_vqa")
class AlbefVQA(AlbefBase, MomentumDistilationMixin):
    """
    ALBEF VQA models.

    Supported model types:
        - base: vqa model initialized with pre-trained ALBEF base model on 115M image-text pairs after CapFilt; not fine-tuned.
        - vqav2: fine-tuned ALBEF base model on VQA v2.0 dataset.

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("albef_vqa", "vqav2")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "vqav2": "configs/models/albef_vqav2.yaml",
    }

    def __init__(
        self,
        image_encoder,
        text_encoder,
        text_decoder,
        use_distill=True,
        momentum=0.995,
        alpha=0.4,
        max_txt_len=35,
    ):
        """
        Args:
            image_encoder: Vision encoder exposing ``forward_features``.
            text_encoder: Question encoder exposing ``forward_automask``.
            text_decoder: Answer decoder (language-model head).
            use_distill (bool): If True, momentum copies of all three modules are created
                and used to produce soft labels during training.
            momentum (float): Momentum coefficient (only stored when ``use_distill``).
            alpha (float): Maximum distillation weight (only stored when ``use_distill``).
            max_txt_len (int): Questions are truncated to this many tokens.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()
        self.max_txt_len = max_txt_len

        self.use_distill = use_distill

        self.visual_encoder = image_encoder

        self.text_encoder = text_encoder
        self.text_decoder = text_decoder

        if self.use_distill:
            self.visual_encoder_m = deepcopy(self.visual_encoder)
            self.text_encoder_m = deepcopy(self.text_encoder)
            self.text_decoder_m = deepcopy(self.text_decoder)

            self.momentum = momentum
            self.alpha = alpha

            self.model_pairs = [
                [self.visual_encoder, self.visual_encoder_m],
                [self.text_encoder, self.text_encoder_m],
                [self.text_decoder, self.text_decoder_m],
            ]

            self.copy_params()

    def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
        """Return a factor that grows linearly from 0 to 1 over the first epoch."""
        return min(1, (epoch * num_iters_per_epoch + iters) / num_iters_per_epoch)

    def forward(self, samples):
        """
        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). Default H=480, W=480.
                - text_input (list): A list of strings, each string is a question
                - answer (list): A list of strings, each string is an answer
                - weight (torch.Tensor): A tensor used to weigh each answer in the loss computation.
                   The shape of the tensor is (sum(n_answers),)
                - n_answers (torch.Tensor): A tensor shape (batch_size,) containing the number of answers
                     for each question in the batch.
                - epoch, iters, num_iters_per_epoch (int): Training progress; read only when
                     ``use_distill`` is True.

        Returns:
            An AlbefOutput object containing loss and intermediate outputs;
            see lavis/models/albef_models/albef_outputs.py for more details.

        Examples:
            >>> import torch
            >>> from lavis.models import load_model
            >>> model = load_model("albef_vqa")
            >>> samples = {
            ...     "image": torch.rand(2, 3, 384, 384),
            ...     "text_input": ["What is this?", "What is that?"],
            ...     "answer": ["cat", "cat", "dog"],
            ...     "weight": torch.tensor([1.0, 1.0, 1.0]),
            ...     "n_answers": torch.tensor([2, 1]),
            ...     "epoch": 0, "iters": 0, "num_iters_per_epoch": 1000,
            ... }
            >>> output = model(samples)
            >>> output.keys()
            odict_keys(['intermediate_output', 'loss'])
        """
        (
            encoder_output,
            encoder_output_m,
            image_embeds,
            image_embeds_m,
        ) = self.forward_encoder(samples)
        loss, decoder_output, decoder_targets = self.forward_decoder(
            samples, encoder_out=(encoder_output, encoder_output_m)
        )

        return AlbefOutput(
            loss=loss,
            intermediate_output=AlbefIntermediateOutput(
                image_embeds=image_embeds,
                image_embeds_m=image_embeds_m,
                encoder_output=encoder_output,
                encoder_output_m=encoder_output_m,
                decoder_output=decoder_output,
                decoder_labels=decoder_targets,
            ),
        )

    def forward_encoder(self, samples):
        """
        Tokenize the questions and encode them together with the image.

        Side effect: stores the tokenized questions in ``samples["tokenized_text"]``.

        Returns:
            tuple: ``(encoder_output, encoder_output_m, image_embeds, image_embeds_m)``;
            the two momentum entries are None when ``use_distill`` is False.
        """
        questions = samples["text_input"]
        questions = self.tokenizer(
            questions,
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(self.device)
        samples.update({"tokenized_text": questions})

        image_embeds = self.visual_encoder.forward_features(samples["image"])
        encoder_output = self.text_encoder.forward_automask(
            tokenized_text=samples["tokenized_text"], visual_embeds=image_embeds
        )

        if self.use_distill:
            self._momentum_update()
            with torch.no_grad():
                image_embeds_m = self.visual_encoder_m(samples["image"])
                encoder_output_m = self.text_encoder_m.forward_automask(
                    tokenized_text=samples["tokenized_text"],
                    visual_embeds=image_embeds_m,
                )
        else:
            encoder_output_m = None
            image_embeds_m = None

        return encoder_output, encoder_output_m, image_embeds, image_embeds_m

    def forward_decoder(self, samples, encoder_out, **kwargs):
        """
        Decode the answers and compute the weighted language-modeling loss.

        Args:
            samples (dict): Same dictionary as in ``forward``; must already contain
                ``tokenized_text`` (set by ``forward_encoder``).
            encoder_out (tuple): ``(encoder_output, encoder_output_m)`` from ``forward_encoder``.

        Returns:
            tuple: ``(loss, answer_output, answer_targets)``.
        """
        answers = self.tokenizer(
            samples["answer"], padding="longest", return_tensors="pt"
        ).to(self.device)
        # padding positions are ignored by the LM loss
        answer_targets = answers.input_ids.masked_fill(
            answers.input_ids == self.tokenizer.pad_token_id, -100
        )

        question_states = []
        question_atts = []

        question = samples["tokenized_text"]
        question_output, question_output_m = encoder_out

        # repeat each question's states once per answer attached to it
        for b, n in enumerate(samples["n_answers"]):
            question_states += [question_output.last_hidden_state[b]] * n
            question_atts += [question.attention_mask[b]] * n

        question_states = torch.stack(question_states, dim=0)
        question_atts = torch.stack(question_atts, dim=0)

        # Distillation inputs are only produced (and passed to the decoder) when the
        # momentum model exists; previously they were used unconditionally, which
        # raised a NameError for use_distill=False. The decoder is called without
        # them in rank_answers, so they are optional arguments.
        distill_kwargs = {}
        if self.use_distill:
            with torch.no_grad():
                question_states_m = []
                for b, n in enumerate(samples["n_answers"]):
                    question_states_m += [question_output_m.last_hidden_state[b]] * n
                question_states_m = torch.stack(question_states_m, 0)

                logits_m = self.text_decoder_m(
                    answers.input_ids,
                    attention_mask=answers.attention_mask,
                    encoder_hidden_states=question_states_m,
                    encoder_attention_mask=question_atts,
                    return_logits=True,
                )

            alpha = self.alpha * self._rampup_factor(
                epoch=samples["epoch"],
                iters=samples["iters"],
                num_iters_per_epoch=samples["num_iters_per_epoch"],
            )
            distill_kwargs = {
                "soft_labels": F.softmax(logits_m, dim=-1),
                "alpha": alpha,
            }

        answer_output = self.text_decoder(
            answers.input_ids,
            attention_mask=answers.attention_mask,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            labels=answer_targets,
            return_dict=True,
            reduction="none",
            **distill_kwargs,
        )

        loss = samples["weight"] * answer_output.loss
        bsz = samples["image"].size(0)

        loss = loss.sum() / bsz

        return loss, answer_output, answer_targets

    def predict_answers(self, samples, answer_list, num_ans_candidates=128, **kwargs):
        """
        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). Default H=480, W=480.
                - text_input (str or [str]): String or a list of strings, each string is a question.
                                             The number of questions must be equal to the batch size. If a single string, will be converted to a list of string, with length 1 first.
            num_ans_candidates (int): Number of answer candidates, used to filter out answers with low probability.
            answer_list (list): A list of strings, each string is an answer.

        Returns:
            List: A list of strings, each string is an answer.

        Examples:
            >>> from PIL import Image
            >>> from lavis.models import load_model_and_preprocess
            >>> model, vis_processors, txt_processors = load_model_and_preprocess("albef_vqa", "vqav2")
            >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB")
            >>> question = "Which city is this photo taken?"
            >>> image = vis_processors["eval"](raw_image).unsqueeze(0)
            >>> question = txt_processors["eval"](question)
            >>> samples = {"image": image, "text_input": [question]}
            >>> answer_list = ["Singapore", "London", "Palo Alto", "Tokyo"]
            >>> answers = model.predict_answers(samples, answer_list=answer_list)
            >>> answers
            ['Singapore']
        """

        if isinstance(samples["text_input"], str):
            samples["text_input"] = [samples["text_input"]]

        assert len(samples["text_input"]) == samples["image"].size(
            0
        ), "The number of questions must be equal to the batch size."

        # cannot rank more candidates than there are answers
        num_ans_candidates = min(num_ans_candidates, len(answer_list))

        return self.rank_answers(
            samples, answer_list=answer_list, num_ans_candidates=num_ans_candidates
        )

    def rank_answers(self, samples, answer_list, num_ans_candidates):
        """
        Generate the first token of answers using decoder and select ${num_ans_candidates}
        most probable ones. Then select answers from answer list, which start with the probable tokens.
        Lastly, use the selected answers as the ground-truth labels for decoding and calculating LM loss.
        Return the answers that minimize the losses as result.

        """
        answer_candidates = self.tokenizer(
            answer_list, padding="longest", return_tensors="pt"
        ).to(self.device)
        # answer_candidates.input_ids[:, 0] = self.tokenizer.bos_token_id

        answer_ids = answer_candidates.input_ids
        answer_atts = answer_candidates.attention_mask

        question_output, _, _, _ = self.forward_encoder(samples)
        question_states = question_output.last_hidden_state

        tokenized_question = samples["tokenized_text"]
        question_atts = tokenized_question.attention_mask

        num_ques = question_states.size(0)
        start_ids = answer_ids[0, 0].repeat(num_ques, 1)  # bos token

        start_output = self.text_decoder(
            start_ids,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            return_dict=True,
            reduction="none",
        )
        logits = start_output.logits[:, 0, :]  # first token's logit

        # topk_probs: top-k probability
        # topk_ids: [num_question, k]
        answer_first_token = answer_ids[:, 1]
        prob_first_token = F.softmax(logits, dim=1).index_select(
            dim=1, index=answer_first_token
        )
        topk_probs, topk_ids = prob_first_token.topk(num_ans_candidates, dim=1)

        # answer input: [num_question*k, answer_len]
        input_ids = []
        input_atts = []
        for topk_id in topk_ids:
            input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
        input_ids = torch.cat(input_ids, dim=0)
        input_atts = torch.cat(input_atts, dim=0)

        targets_ids = input_ids.masked_fill(
            input_ids == self.tokenizer.pad_token_id, -100
        )

        # repeat encoder's output for top-k answers
        question_states = tile(question_states, 0, num_ans_candidates)
        question_atts = tile(question_atts, 0, num_ans_candidates)

        output = self.text_decoder(
            input_ids,
            attention_mask=input_atts,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            labels=targets_ids,
            return_dict=True,
            reduction="none",
        )

        log_probs_sum = -output.loss
        log_probs_sum = log_probs_sum.view(num_ques, num_ans_candidates)

        max_topk_ids = log_probs_sum.argmax(dim=1)
        # the all-True mask selects every row, pairing row i with column max_topk_ids[i]
        max_ids = topk_ids[max_topk_ids >= 0, max_topk_ids]

        answers = [answer_list[max_id] for max_id in max_ids]

        return answers

    @classmethod
    def from_config(cls, cfg=None):
        """Build encoder, 6-layer decoder and model from a config node, then load weights."""
        image_encoder = VisionTransformerEncoder.from_config(cfg)

        text_encoder = XBertEncoder.from_config(cfg)

        config_decoder = BertConfig.from_json_file(get_abs_path(cfg["med_config_path"]))
        config_decoder.fusion_layer = 0
        config_decoder.num_hidden_layers = 6
        text_decoder = BertLMHeadModel.from_pretrained(
            "bert-base-uncased", config=config_decoder
        )

        alpha = cfg.get("alpha", 0.4)
        momentum = cfg.get("momentum", 0.995)
        use_distill = cfg.get("use_distill", True)
        max_txt_len = cfg.get("max_txt_len", 25)

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            text_decoder=text_decoder,
            use_distill=use_distill,
            momentum=momentum,
            alpha=alpha,
            max_txt_len=max_txt_len,
        )

        # load pre-trained weights
        model.load_checkpoint_from_config(cfg)

        return model

    def load_from_pretrained(self, url_or_filename):
        """
        Load a checkpoint, adapting position embeddings and deriving decoder weights.

        Args:
            url_or_filename (str): URL or local path of the checkpoint.

        Returns:
            The result of ``load_state_dict(..., strict=False)``.

        Raises:
            RuntimeError: If ``url_or_filename`` is neither a URL nor an existing file.
        """
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        if "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint

        # reshape positional embedding to accommodate for image resolution change
        pos_embed_reshaped = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )
        state_dict["visual_encoder.pos_embed"] = pos_embed_reshaped

        # The momentum encoder only exists when use_distill=True, and a checkpoint
        # may not ship momentum weights; skip the resize in either case.
        m_pos_embed_key = "visual_encoder_m.pos_embed"
        if m_pos_embed_key in state_dict and hasattr(self, "visual_encoder_m"):
            state_dict[m_pos_embed_key] = interpolate_pos_embed(
                state_dict[m_pos_embed_key], self.visual_encoder_m
            )

        for key in list(state_dict.keys()):
            if "bert" in key:
                encoder_key = key.replace("bert.", "")
                state_dict[encoder_key] = state_dict[key]

            # initialize text decoder as multimodal encoder (last 6 layers of model.text_encoder)
            if "text_encoder" in key:
                if "layer" in key:
                    encoder_keys = key.split(".")
                    layer_num = int(encoder_keys[4])

                    if layer_num < 6:
                        del state_dict[key]
                        continue
                    else:
                        decoder_layer_num = layer_num - 6
                        encoder_keys[4] = str(decoder_layer_num)
                        encoder_key = ".".join(encoder_keys)
                else:
                    encoder_key = key
                decoder_key = encoder_key.replace("text_encoder", "text_decoder")
                state_dict[decoder_key] = state_dict[key]

                del state_dict[key]

        # drop checkpoint tensors whose shape no longer matches the model
        own_state = self.state_dict()
        for key, own_value in own_state.items():
            if key in state_dict and state_dict[key].shape != own_value.shape:
                del state_dict[key]

        msg = self.load_state_dict(state_dict, strict=False)
        logging.info("load checkpoint from %s" % url_or_filename)
        logging.info(f"missing keys: {msg.missing_keys}")

        return msg
import logging
import os

import torch
import torch.nn.functional as F
from lavis.common.dist_utils import download_cached_file
from lavis.common.utils import is_url
from lavis.models.base_model import BaseModel
from transformers import BertTokenizer


class AlproBase(BaseModel):
    """Shared base for ALPRO models: tokenizer creation and checkpoint loading."""

    @classmethod
    def init_tokenizer(cls):
        """Return the ``bert-base-uncased`` tokenizer."""
        return BertTokenizer.from_pretrained("bert-base-uncased")

    def load_from_pretrained(self, url_or_filename, num_frames, num_patches):
        """
        Load a checkpoint, resizing spatial/temporal position embeddings if needed.

        Args:
            url_or_filename (str): URL or local path of the checkpoint.
            num_frames (int): Number of frames the model's temporal embedding must cover.
            num_patches (int): Number of spatial patches (the class-token slot is added on top).

        Returns:
            The result of ``load_state_dict(..., strict=False)``.

        Raises:
            RuntimeError: If ``url_or_filename`` is neither a URL nor an existing file.
        """
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        if "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint

        for key in list(state_dict.keys()):
            # Match on "bert." (what is actually stripped). Matching on "bert" alone
            # would map a key without the dot onto itself and then delete it.
            if "bert." in key:
                new_key = key.replace("bert.", "")
                state_dict[new_key] = state_dict[key]
                del state_dict[key]

        spatial_embed_key = "visual_encoder.model.pos_embed"
        temporal_embed_key = "visual_encoder.model.time_embed"

        ## Resizing spatial embeddings in case they don't match
        if num_patches + 1 != state_dict[spatial_embed_key].size(1):
            state_dict[spatial_embed_key] = resize_spatial_embedding(
                state_dict, spatial_embed_key, num_patches
            )
        else:
            logging.info(
                "The length of spatial position embedding matches. No need to resize."
            )

        ## Resizing time embeddings in case they don't match
        if temporal_embed_key in state_dict and num_frames != state_dict[
            temporal_embed_key
        ].size(1):
            state_dict[temporal_embed_key] = resize_temporal_embedding(
                state_dict, temporal_embed_key, num_frames
            )
        else:
            logging.info(
                "No temporal encoding found. Or the length of temporal position embedding matches. No need to resize."
            )

        msg = self.load_state_dict(state_dict, strict=False)
        logging.info("Missing keys {}".format(msg.missing_keys))
        logging.info("load checkpoint from %s" % url_or_filename)

        return msg


def resize_spatial_embedding(state_dict, key, num_patches):
    """
    Resize a spatial position embedding to ``num_patches`` patch slots plus the class slot.

    Args:
        state_dict (dict): Checkpoint state dict holding the embedding.
        key (str): Key of the embedding; indexed as (1, 1 + old_num_patches, dim).
        num_patches (int): Target number of patch positions.

    Returns:
        torch.Tensor: Embedding of shape (1, 1 + num_patches, dim); the first position
        is copied unchanged, the rest are nearest-neighbour interpolated.
    """
    logging.info(
        f"Resizing spatial position embedding from {state_dict[key].size(1)} to {num_patches + 1}"
    )

    pos_embed = state_dict[key]

    # position 0 is kept as-is; only the remaining positions are interpolated
    cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)
    other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(1, 2)

    new_pos_embed = F.interpolate(other_pos_embed, size=(num_patches), mode="nearest")
    new_pos_embed = new_pos_embed.transpose(1, 2)
    new_pos_embed = torch.cat((cls_pos_embed, new_pos_embed), 1)

    return new_pos_embed


def resize_temporal_embedding(state_dict, key, num_frames):
    """
    Resize a temporal position embedding along its second axis to ``num_frames``.

    Args:
        state_dict (dict): Checkpoint state dict holding the embedding.
        key (str): Key of the embedding; treated as (1, old_num_frames, dim).
        num_frames (int): Target number of frames.

    Returns:
        torch.Tensor: Nearest-neighbour interpolated embedding of shape (1, num_frames, dim).
    """
    logging.info(
        f"Resizing temporal position embedding from {state_dict[key].size(1)} to {num_frames}"
    )

    # F.interpolate resizes the last axis, so move the frame axis there and back
    time_embed = state_dict[key].transpose(1, 2)
    new_time_embed = F.interpolate(time_embed, size=(num_frames), mode="nearest")

    return new_time_embed.transpose(1, 2)
b/lavis/models/alpro_models/__pycache__/alpro_outputs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bfb1eab686f2a1297e1d3eff5da1547bd82315b1 Binary files /dev/null and b/lavis/models/alpro_models/__pycache__/alpro_outputs.cpython-310.pyc differ diff --git a/lavis/models/alpro_models/__pycache__/alpro_qa.cpython-310.pyc b/lavis/models/alpro_models/__pycache__/alpro_qa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3beb66ad40f85404008ee2a9f94fa49fbbdf1540 Binary files /dev/null and b/lavis/models/alpro_models/__pycache__/alpro_qa.cpython-310.pyc differ diff --git a/lavis/models/alpro_models/__pycache__/alpro_retrieval.cpython-310.pyc b/lavis/models/alpro_models/__pycache__/alpro_retrieval.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1690720e422ea295b6f4ccab0f1be2554b716496 Binary files /dev/null and b/lavis/models/alpro_models/__pycache__/alpro_retrieval.cpython-310.pyc differ diff --git a/lavis/models/alpro_models/alpro_outputs.py b/lavis/models/alpro_models/alpro_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..68a11a9cfbd95c866597cf0e8d5a126134587de6 --- /dev/null +++ b/lavis/models/alpro_models/alpro_outputs.py @@ -0,0 +1,59 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
from dataclasses import dataclass
from typing import Optional

import torch
from transformers.modeling_outputs import (
    BaseModelOutputWithPoolingAndCrossAttentions,
    ModelOutput,
)


@dataclass
class AlproSimilarity(ModelOutput):
    """Video-text similarity matrices and, optionally, their training targets."""

    # video-to-text and text-to-video similarity scores
    sim_v2t: torch.FloatTensor = None
    sim_t2v: torch.FloatTensor = None

    # targets the similarities were trained against (may be absent)
    sim_v2t_targets: Optional[torch.FloatTensor] = None
    sim_t2v_targets: Optional[torch.FloatTensor] = None


@dataclass
class AlproIntermediateOutput(ModelOutput):
    """Intermediate tensors produced while computing an ALPRO loss."""

    # uni-modal features
    video_embeds: torch.FloatTensor = None
    text_embeds: Optional[torch.FloatTensor] = None

    # multimodal encoder outputs for the positive and negative pairs
    encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
    encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None

    # video-text matching head outputs and their labels
    vtm_logits: Optional[torch.FloatTensor] = None
    vtm_labels: Optional[torch.LongTensor] = None


@dataclass
class AlproOutput(ModelOutput):
    """Top-level ALPRO output: total loss, loss terms and intermediate results."""

    # not every fine-tuned model computes similarities, hence optional
    sims: Optional[AlproSimilarity] = None

    intermediate_output: AlproIntermediateOutput = None

    # total loss followed by its individual terms
    loss: Optional[torch.FloatTensor] = None

    loss_vtc: Optional[torch.FloatTensor] = None

    loss_vtm: Optional[torch.FloatTensor] = None

    loss_mlm: Optional[torch.FloatTensor] = None


@dataclass
class AlproOutputWithLogits(AlproOutput):
    """``AlproOutput`` extended with classification logits."""

    logits: torch.FloatTensor = None
from warnings import warn

import torch
import torch.nn.functional as F
from lavis.common.config import node_to_dict
from lavis.common.registry import registry
from lavis.models.alpro_models import AlproBase
from lavis.models.alpro_models.alpro_outputs import (
    AlproIntermediateOutput,
    AlproOutputWithLogits,
)
from lavis.models.med import XBertEncoder
from lavis.models.timesformer.vit import TimeSformer
from torch import nn


@registry.register_model("alpro_qa")
class AlproQA(AlproBase):
    """ALPRO video question answering model (classification over a fixed answer set)."""

    PRETRAINED_MODEL_CONFIG_DICT = {
        "msrvtt": "configs/models/alpro_qa_msrvtt.yaml",
        "msvd": "configs/models/alpro_qa_msvd.yaml",
    }

    def __init__(
        self, visual_encoder, text_encoder, hidden_size, num_classes, max_txt_len=40
    ):
        """
        Args:
            visual_encoder: Video encoder exposing ``forward_features``.
            text_encoder: Text/fusion encoder exposing ``forward_text``.
            hidden_size (int): Width of the fused [CLS] feature fed to the classifier.
            num_classes (int): Number of answer classes; if not positive, no classifier
                is built and ``forward`` cannot be used.
            max_txt_len (int): Questions are padded/truncated to this many tokens.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        self.visual_encoder = visual_encoder

        self.text_encoder = text_encoder

        if num_classes > 0:
            self.classifier = nn.Sequential(
                nn.Linear(hidden_size, hidden_size * 2),
                nn.ReLU(True),
                nn.Linear(hidden_size * 2, num_classes),
            )
        else:
            # Keep the attribute defined so forward can fail with a clear message
            # instead of an opaque AttributeError.
            self.classifier = None
            warn(
                f"num_classes is {num_classes}. Initialized {type(self)} without classifier."
            )

        self.max_txt_len = max_txt_len

    def forward(self, samples, is_train=True):
        """
        Args:
            samples (dict): A dictionary containing the following keys:
                - video: Video input passed to the visual encoder.
                - text_input (list): A list of question strings.
                - answers: Target class indices, used by the cross-entropy loss.
            is_train (bool): If True, compute the loss and return a structured output;
                otherwise return raw predictions with their targets.

        Returns:
            AlproOutputWithLogits when ``is_train`` is True, else a dict with keys
            ``"predictions"`` and ``"targets"``.

        Raises:
            RuntimeError: If the model was built without a classifier (``num_classes <= 0``).
        """
        if self.classifier is None:
            raise RuntimeError(
                "AlproQA was initialized without a classifier (num_classes <= 0); "
                "set num_classes to a positive value."
            )

        visual_inputs = samples["video"]
        question = samples["text_input"]
        targets = samples["answers"]

        # forward text
        text = self.tokenizer(
            question,
            padding="max_length",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(self.device)

        text_output = self.text_encoder.forward_text(
            text,
            token_type_ids=torch.zeros(
                text.input_ids.shape, dtype=torch.long, device=self.device
            ),
        )
        text_embeds = text_output.last_hidden_state

        # forward visual
        # timeSformer asks for (b, c, t, h, w) as input.
        video_embeds = self.visual_encoder.forward_features(visual_inputs)
        video_atts = torch.ones(video_embeds.size()[:-1], dtype=torch.long).to(
            self.device
        )

        # forward cross-encoder: text and video tokens are fused as one sequence
        attention_mask = torch.cat([text.attention_mask, video_atts], dim=1)
        embedding_output = torch.cat([text_embeds, video_embeds], dim=1)

        encoder_output = self.text_encoder(
            encoder_embeds=embedding_output,
            attention_mask=attention_mask,
            return_dict=True,
            mode="fusion",
        )

        # classify from the first ([CLS]) position of the fused sequence
        prediction = self.classifier(encoder_output.last_hidden_state[:, 0, :])
        if is_train:
            loss = F.cross_entropy(prediction, targets)
            return AlproOutputWithLogits(
                loss=loss,
                intermediate_output=AlproIntermediateOutput(
                    video_embeds=video_embeds,
                    text_embeds=text_embeds,
                    encoder_output=encoder_output,
                ),
                logits=prediction,
            )
        else:
            return {"predictions": prediction, "targets": targets}

    def predict(self, samples):
        """Run ``forward`` in evaluation mode and return its predictions/targets dict."""
        output = self.forward(samples, is_train=False)
        return output

    @classmethod
    def from_config(cls, cfg):
        """Build the model from a config node and load the checkpoint it points to."""
        # vision encoder
        visual_encoder_config = node_to_dict(cfg.timesformer)
        visual_encoder = TimeSformer(**visual_encoder_config)

        # text encoder
        text_encoder = XBertEncoder.from_config(cfg)

        num_classes = cfg.get("num_classes", -1)
        hidden_size = cfg.get("hidden_size", 768)
        # honour a configured text length; the default matches __init__
        max_txt_len = cfg.get("max_txt_len", 40)

        model = cls(
            visual_encoder=visual_encoder,
            text_encoder=text_encoder,
            hidden_size=hidden_size,
            num_classes=num_classes,
            max_txt_len=max_txt_len,
        )

        num_patches = (
            visual_encoder_config["image_size"] // visual_encoder_config["patch_size"]
        ) ** 2
        num_frames = visual_encoder_config["n_frms"]

        model.load_checkpoint_from_config(
            cfg, num_frames=num_frames, num_patches=num_patches
        )

        return model
b/lavis/models/alpro_models/alpro_retrieval.py @@ -0,0 +1,422 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import datetime +import logging +import time + +import lavis.common.dist_utils as dist_utils +import numpy as np +import torch +import torch.distributed as dist +import torch.nn.functional as F +from lavis.common.config import node_to_dict +from lavis.common.dist_utils import get_rank +from lavis.common.logger import MetricLogger +from lavis.common.registry import registry +from lavis.models.alpro_models import AlproBase +from lavis.models.alpro_models.alpro_outputs import AlproIntermediateOutput, AlproOutput +from lavis.models.base_model import all_gather_with_grad +from lavis.models.med import XBertEncoder +from lavis.models.timesformer.vit import TimeSformer +from torch import nn + + +@registry.register_model("alpro_retrieval") +class AlproRetrieval(AlproBase): + PRETRAINED_MODEL_CONFIG_DICT = { + "msrvtt": "configs/models/alpro_retrieval_msrvtt.yaml", + "didemo": "configs/models/alpro_retrieval_didemo.yaml", + } + + def __init__( + self, + visual_encoder, + text_encoder, + vision_width=768, + text_width=768, + embed_dim=256, + max_txt_len=35, + temp=0.07, + ): + super().__init__() + + self.temp = nn.Parameter(torch.ones([]) * temp) + + self.tokenizer = self.init_tokenizer() + + self.visual_encoder = visual_encoder + self.text_encoder = text_encoder + + vision_width = vision_width + text_width = text_width + + self.vision_proj = nn.Linear(vision_width, embed_dim) + self.text_proj = nn.Linear(text_width, embed_dim) + + self.itm_head = nn.Linear(text_width, 2) + + self.max_txt_len = max_txt_len + + def forward(self, samples): + with torch.no_grad(): + self.temp.clamp_(0.001, 0.5) + + visual_inputs = samples["video"] + caption = samples["text_input"] + + b, t, c, h, w = 
visual_inputs.shape + + # forward text + text = self.tokenizer( + caption, + padding="max_length", + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + text_output = self.text_encoder.forward_text( + text, + token_type_ids=torch.zeros( + text.input_ids.shape, dtype=torch.long, device=self.device + ), + ) + text_embeds = text_output.last_hidden_state + text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1) + + # forward visual + # timeSformer asks for (b, c, t, h, w) as input. + video_embeds = self.visual_encoder.forward_features(visual_inputs) + video_feat = F.normalize(self.vision_proj(video_embeds[:, 0, :]), dim=-1) + video_atts = torch.ones(video_embeds.size()[:-1], dtype=torch.long).to( + self.device + ) + + # ========== (in-batch) ITC loss ========== + gathered_video_feats = all_gather_with_grad(video_feat) + gathered_text_feats = all_gather_with_grad(text_feat) + + sim_v2t = video_feat @ gathered_text_feats.t() / self.temp + sim_t2v = text_feat @ gathered_video_feats.t() / self.temp + + sim_targets = torch.zeros_like(sim_v2t) + + local_rank = get_rank() + b_start, b_end = b * local_rank, b * (local_rank + 1) + sim_targets[:, b_start:b_end] = torch.eye(b) + + loss_v2t = -torch.sum(F.log_softmax(sim_v2t, dim=1) * sim_targets, dim=1).mean() + loss_t2v = -torch.sum(F.log_softmax(sim_t2v, dim=1) * sim_targets, dim=1).mean() + + vtc_loss = (loss_v2t + loss_t2v) / 2 + + ( + vtm_loss, + vtm_logits, + vtm_labels, + encoder_output, + encoder_output_neg, + ) = self.compute_vtm( + text_embeds=text_embeds, + text_atts=text.attention_mask, + image_embeds=video_embeds, + image_atts=video_atts, + sim_i2t=sim_v2t.clone(), # for hard mining + sim_t2i=sim_t2v.clone(), # for hard mining + ) + + loss = vtc_loss + vtm_loss + + # return {"loss": loss} + return AlproOutput( + loss=loss, + loss_vtc=vtc_loss, + loss_vtm=vtm_loss, + intermediate_output=AlproIntermediateOutput( + video_embeds=video_embeds, + 
text_embeds=text_embeds, + encoder_output=encoder_output, + encoder_output_neg=encoder_output_neg, + vtm_logits=vtm_logits, + vtm_labels=vtm_labels, + ), + ) + + def compute_vtm( + self, text_embeds, text_atts, image_embeds, image_atts, sim_i2t, sim_t2i + ): + device = self.device + + # ====== positive pairs ======= + attention_mask = torch.cat([text_atts, image_atts], dim=1) + embedding_output_pos = torch.cat([text_embeds, image_embeds], dim=1) + + encoder_outputs_pos = self.text_encoder( + encoder_embeds=embedding_output_pos, + attention_mask=attention_mask, + return_dict=True, + mode="fusion", + ) + + # ====== negative pairs ======= + bs = text_embeds.shape[0] + + local_rank = get_rank() + b_start, b_end = bs * local_rank, bs * (local_rank + 1) + + with torch.no_grad(): + weights_v2t = sim_i2t[:, b_start:b_end] + weights_t2v = sim_t2i[:, b_start:b_end] + + # never select self as negative + weights_v2t.fill_diagonal_(-np.inf) + weights_t2v.fill_diagonal_(-np.inf) + + weights_v2t = F.softmax(weights_v2t, dim=1) + weights_t2v = F.softmax(weights_t2v, dim=1) + + # select a negative image for each text + # FIXME to optimize using indexing operations + image_embeds_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_t2v[b], 1).item() + image_embeds_neg.append(image_embeds[neg_idx]) + image_embeds_neg = torch.stack(image_embeds_neg, dim=0) + + # select a negative text for each image + text_embeds_neg = [] + text_atts_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_v2t[b], 1).item() + text_embeds_neg.append(text_embeds[neg_idx]) + text_atts_neg.append(text_atts[neg_idx]) + + text_embeds_neg = torch.stack(text_embeds_neg, dim=0) + text_atts_neg = torch.stack(text_atts_neg, dim=0) + + text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0) + text_atts_all = torch.cat([text_atts, text_atts_neg], dim=0) + + video_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0) + video_atts_all = torch.cat([image_atts, 
image_atts], dim=0) + + attention_mask_all = torch.cat([text_atts_all, video_atts_all], dim=1) + embedding_output_all = torch.cat([text_embeds_all, video_embeds_all], dim=1) + + # forward negative pairs via cross encoder + encoder_outputs_neg = self.text_encoder( + encoder_embeds=embedding_output_all, + attention_mask=attention_mask_all, + return_dict=True, + mode="fusion", + ) + + vl_embeddings = torch.cat( + [ + encoder_outputs_pos.last_hidden_state[:, 0, :], + encoder_outputs_neg.last_hidden_state[:, 0, :], + ], + dim=0, + ) + vtm_logits = self.itm_head(vl_embeddings) + + vtm_labels = torch.cat( + [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)], + dim=0, + ).to(device) + vtm_loss = F.cross_entropy(vtm_logits, vtm_labels) + + return ( + vtm_loss, + vtm_logits, + vtm_labels, + encoder_outputs_pos, + encoder_outputs_neg, + ) + + def compute_sim_matrix(self, data_loader, task_cfg): + k_test = task_cfg.get("k_test") + + metric_logger = MetricLogger(delimiter=" ") + header = "Evaluation:" + + logging.info("Computing features for evaluation...") + start_time = time.time() + + texts = data_loader.dataset.text + num_text = len(texts) + text_bs = 256 + text_ids = [] + text_embeds = [] + text_feats = [] + text_atts = [] + for i in range(0, num_text, text_bs): + text = texts[i : min(num_text, i + text_bs)] + text_input = self.tokenizer( + text, + padding="max_length", + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + text_output = self.text_encoder.forward_text( + text_input, + token_type_ids=torch.zeros( + text_input.input_ids.shape, dtype=torch.long, device=self.device + ), + ) + text_feats.append(text_output.last_hidden_state.cpu()) + text_embed = F.normalize( + self.text_proj(text_output.last_hidden_state[:, 0, :]) + ) + text_embeds.append(text_embed) + text_ids.append(text_input.input_ids) + text_atts.append(text_input.attention_mask) + + text_embeds = torch.cat(text_embeds, dim=0) + text_ids = 
torch.cat(text_ids, dim=0) + text_atts = torch.cat(text_atts, dim=0) + text_feats = torch.cat(text_feats, dim=0) + + video_feats = [] + video_embeds = [] + for samples in data_loader: + video = samples["video"] + + video = video.to(self.device) + video_feat = self.visual_encoder.forward_features(video) + video_embed = self.vision_proj(video_feat[:, 0, :]) + video_embed = F.normalize(video_embed, dim=-1) + + video_feats.append(video_feat.cpu()) + video_embeds.append(video_embed) + + video_feats = torch.cat(video_feats, dim=0) + video_embeds = torch.cat(video_embeds, dim=0) + + sims_matrix = video_embeds @ text_embeds.t() + score_matrix_v2t = torch.full( + (len(data_loader.dataset.image), len(texts)), -100.0 + ).to(self.device) + + num_tasks = dist_utils.get_world_size() + rank = dist_utils.get_rank() + step = sims_matrix.size(0) // num_tasks + 1 + start = rank * step + end = min(sims_matrix.size(0), start + step) + + # video-to-text + for i, sims in enumerate( + metric_logger.log_every(sims_matrix[start:end], 50, header) + ): + topk_sim, topk_idx = sims.topk(k=k_test, dim=0) + + video_feats_repeat = ( + video_feats[start + i].repeat(k_test, 1, 1).to(self.device) + ) + video_atts_repeat = torch.ones( + video_feats_repeat.size()[:-1], dtype=torch.long + ).to(self.device) + + attention_mask = torch.cat([text_atts[topk_idx], video_atts_repeat], dim=1) + embedding_output = torch.cat( + [text_feats[topk_idx].to(self.device), video_feats_repeat], dim=1 + ) + + output = self.text_encoder( + encoder_embeds=embedding_output, + attention_mask=attention_mask, + return_dict=True, + mode="fusion", + ) + + score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_v2t[start + i, topk_idx] = score + topk_sim + + # text-to-video + sims_matrix = sims_matrix.t() + score_matrix_t2v = torch.full( + (len(texts), len(data_loader.dataset.image)), -100.0 + ).to(self.device) + + step = sims_matrix.size(0) // num_tasks + 1 + start = rank * step + end = 
min(sims_matrix.size(0), start + step) + + for i, sims in enumerate( + metric_logger.log_every(sims_matrix[start:end], 50, header) + ): + + topk_sim, topk_idx = sims.topk(k=k_test, dim=0) + + text_feats_repeat = ( + text_feats[start + i].repeat(k_test, 1, 1).to(self.device) + ) + text_atts_repeat = text_atts[start + i].repeat(k_test, 1).to(self.device) + + video_atts = torch.ones( + video_feats[topk_idx].size()[:-1], dtype=torch.long + ).to(self.device) + + embedding_output = torch.cat( + [text_feats_repeat, video_feats[topk_idx].to(self.device)], dim=1 + ) + attention_mask = torch.cat([text_atts_repeat, video_atts], dim=1) + + output = self.text_encoder( + encoder_embeds=embedding_output, + attention_mask=attention_mask, + return_dict=True, + mode="fusion", + ) + + score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_t2v[start + i, topk_idx] = score + topk_sim + + if dist_utils.is_dist_avail_and_initialized(): + dist.barrier() + torch.distributed.all_reduce( + score_matrix_v2t, op=torch.distributed.ReduceOp.SUM + ) + torch.distributed.all_reduce( + score_matrix_t2v, op=torch.distributed.ReduceOp.SUM + ) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logging.info("Evaluation time {}".format(total_time_str)) + + return score_matrix_v2t.cpu().numpy(), score_matrix_t2v.cpu().numpy() + + @classmethod + def from_config(cls, cfg): + # vision encoder + visual_encoder_config = node_to_dict(cfg.timesformer) + visual_encoder = TimeSformer(**visual_encoder_config) + + # text encoder + text_encoder = XBertEncoder.from_config(cfg) + + max_txt_len = cfg.get("max_txt_len", 35) + + model = cls( + visual_encoder=visual_encoder, + text_encoder=text_encoder, + max_txt_len=max_txt_len, + ) + + num_patches = ( + visual_encoder_config["image_size"] // visual_encoder_config["patch_size"] + ) ** 2 + num_frames = visual_encoder_config["n_frms"] + + model.load_checkpoint_from_config( + cfg, 
num_frames=num_frames, num_patches=num_patches + ) + + return model diff --git a/lavis/models/base_model.py b/lavis/models/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..44d66f3fc3c91cd1687eb37c53704f2a97f990a1 --- /dev/null +++ b/lavis/models/base_model.py @@ -0,0 +1,267 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import logging +import os + +import numpy as np +import torch +import torch.nn as nn +from lavis.common.dist_utils import download_cached_file, is_dist_avail_and_initialized +from lavis.common.utils import get_abs_path, is_url +from omegaconf import OmegaConf + + +class BaseModel(nn.Module): + """Base class for models.""" + + def __init__(self): + super().__init__() + + @property + def device(self): + return list(self.parameters())[0].device + + def load_checkpoint(self, url_or_filename): + """ + Load from a finetuned checkpoint. + + This should expect no mismatch in the model keys and the checkpoint keys. + """ + + if is_url(url_or_filename): + cached_file = download_cached_file( + url_or_filename, check_hash=False, progress=True + ) + checkpoint = torch.load(cached_file, map_location="cpu") + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location="cpu") + else: + raise RuntimeError("checkpoint url or path is invalid") + + if "model" in checkpoint.keys(): + state_dict = checkpoint["model"] + else: + state_dict = checkpoint + + msg = self.load_state_dict(state_dict, strict=False) + + logging.info("Missing keys {}".format(msg.missing_keys)) + logging.info("load checkpoint from %s" % url_or_filename) + + return msg + + @classmethod + def from_pretrained(cls, model_type): + """ + Build a pretrained model from default configuration file, specified by model_type. 
+ + Args: + - model_type (str): model type, specifying architecture and checkpoints. + + Returns: + - model (nn.Module): pretrained or finetuned model, depending on the configuration. + """ + model_cfg = OmegaConf.load(cls.default_config_path(model_type)).model + model = cls.from_config(model_cfg) + + return model + + @classmethod + def default_config_path(cls, model_type): + assert ( + model_type in cls.PRETRAINED_MODEL_CONFIG_DICT + ), "Unknown model type {}".format(model_type) + return get_abs_path(cls.PRETRAINED_MODEL_CONFIG_DICT[model_type]) + + def load_checkpoint_from_config(self, cfg, **kwargs): + """ + Load checkpoint as specified in the config file. + + If load_finetuned is True, load the finetuned model; otherwise, load the pretrained model. + When loading the pretrained model, each task-specific architecture may define their + own load_from_pretrained() method. + """ + load_finetuned = cfg.get("load_finetuned", True) + if load_finetuned: + finetune_path = cfg.get("finetuned", None) + assert ( + finetune_path is not None + ), "Found load_finetuned is True, but finetune_path is None." + self.load_checkpoint(url_or_filename=finetune_path) + else: + load_pretrained = cfg.get("load_pretrained", True) + if load_pretrained: + # load pre-trained weights + pretrain_path = cfg.get("pretrained", None) + assert pretrain_path is not None, "Found load_finetuned is False, but pretrain_path is None." 
+ self.load_from_pretrained(url_or_filename=pretrain_path, **kwargs) + + def before_training(self, **kwargs): + pass + + def get_optimizer_params(self, weight_decay, lr_scale=1): + p_wd, p_non_wd = [], [] + for n, p in self.named_parameters(): + if not p.requires_grad: + continue # frozen weights + if p.ndim < 2 or "bias" in n or "ln" in n or "bn" in n: + p_non_wd.append(p) + else: + p_wd.append(p) + optim_params = [ + {"params": p_wd, "weight_decay": weight_decay, "lr_scale": lr_scale}, + {"params": p_non_wd, "weight_decay": 0, "lr_scale": lr_scale}, + ] + return optim_params + + def before_evaluation(self, **kwargs): + pass + + def show_n_params(self, return_str=True): + tot = 0 + for p in self.parameters(): + w = 1 + for x in p.shape: + w *= x + tot += w + if return_str: + if tot >= 1e6: + return "{:.1f}M".format(tot / 1e6) + else: + return "{:.1f}K".format(tot / 1e3) + else: + return tot + + +class BaseEncoder(nn.Module): + """ + Base class for primitive encoders, such as ViT, TimeSformer, etc. 
+ """ + + def __init__(self): + super().__init__() + + def forward_features(self, samples, **kwargs): + raise NotImplementedError + + @property + def device(self): + return list(self.parameters())[0].device + + +class SharedQueueMixin: + @torch.no_grad() + def _dequeue_and_enqueue(self, image_feat, text_feat, idxs=None): + # gather keys before updating queue + image_feats = concat_all_gather(image_feat) + text_feats = concat_all_gather(text_feat) + + batch_size = image_feats.shape[0] + + ptr = int(self.queue_ptr) + assert self.queue_size % batch_size == 0 # for simplicity + + # replace the keys at ptr (dequeue and enqueue) + self.image_queue[:, ptr : ptr + batch_size] = image_feats.T + self.text_queue[:, ptr : ptr + batch_size] = text_feats.T + + if idxs is not None: + idxs = concat_all_gather(idxs) + self.idx_queue[:, ptr : ptr + batch_size] = idxs.T + + ptr = (ptr + batch_size) % self.queue_size # move pointer + self.queue_ptr[0] = ptr + + +class MomentumDistilationMixin: + @torch.no_grad() + def copy_params(self): + for model_pair in self.model_pairs: + for param, param_m in zip( + model_pair[0].parameters(), model_pair[1].parameters() + ): + param_m.data.copy_(param.data) # initialize + param_m.requires_grad = False # not update by gradient + + @torch.no_grad() + def _momentum_update(self): + for model_pair in self.model_pairs: + for param, param_m in zip( + model_pair[0].parameters(), model_pair[1].parameters() + ): + param_m.data = param_m.data * self.momentum + param.data * ( + 1.0 - self.momentum + ) + + +class GatherLayer(torch.autograd.Function): + """ + Gather tensors from all workers with support for backward propagation: + This implementation does not cut the gradients as torch.distributed.all_gather does. 
+ """ + + @staticmethod + def forward(ctx, x): + output = [ + torch.zeros_like(x) for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(output, x) + return tuple(output) + + @staticmethod + def backward(ctx, *grads): + all_gradients = torch.stack(grads) + torch.distributed.all_reduce(all_gradients) + return all_gradients[torch.distributed.get_rank()] + + +def all_gather_with_grad(tensors): + """ + Performs all_gather operation on the provided tensors. + Graph remains connected for backward grad computation. + """ + # Queue the gathered tensors + world_size = torch.distributed.get_world_size() + # There is no need for reduction in the single-proc case + if world_size == 1: + return tensors + + # tensor_all = GatherLayer.apply(tensors) + tensor_all = GatherLayer.apply(tensors) + + return torch.cat(tensor_all, dim=0) + + +@torch.no_grad() +def concat_all_gather(tensor): + """ + Performs all_gather operation on the provided tensors. + *** Warning ***: torch.distributed.all_gather has no gradient. 
+ """ + # if use distributed training + if not is_dist_avail_and_initialized(): + return tensor + + tensors_gather = [ + torch.ones_like(tensor) for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(tensors_gather, tensor, async_op=False) + + output = torch.cat(tensors_gather, dim=0) + return output + + +def tile(x, dim, n_tile): + init_dim = x.size(dim) + repeat_idx = [1] * x.dim() + repeat_idx[dim] = n_tile + x = x.repeat(*(repeat_idx)) + order_index = torch.LongTensor( + np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]) + ) + return torch.index_select(x, dim, order_index.to(x.device)) diff --git a/lavis/models/beats/BEATs.py b/lavis/models/beats/BEATs.py new file mode 100644 index 0000000000000000000000000000000000000000..7a5e3887684e41ea70af5896dfd52f42b144f6a4 --- /dev/null +++ b/lavis/models/beats/BEATs.py @@ -0,0 +1,180 @@ +# -------------------------------------------------------- +# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058) +# Github source: https://github.com/microsoft/unilm/tree/master/beats +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on fairseq code bases +# https://github.com/pytorch/fairseq +# -------------------------------------------------------- + + +import torch +import torch.nn as nn +from torch.nn import LayerNorm +import torchaudio.compliance.kaldi as ta_kaldi + +from lavis.models.beats.backbone import ( + TransformerEncoder, +) + +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + + +class BEATsConfig: + def __init__(self, cfg=None): + self.input_patch_size: int = -1 # path size of patch embedding + self.embed_dim: int = 512 # patch embedding dimension + self.conv_bias: bool = False # include bias in conv encoder + + self.encoder_layers: int = 12 # num encoder layers in the transformer + self.encoder_embed_dim: int = 768 # encoder embedding dimension + 
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN + self.encoder_attention_heads: int = 12 # num encoder attention heads + self.activation_fn: str = "gelu" # activation function to use + + self.layer_wise_gradient_decay_ratio: float = 1.0 # ratio for layer-wise gradient decay + self.layer_norm_first: bool = False # apply layernorm first in the transformer + self.deep_norm: bool = False # apply deep_norm first in the transformer + + # dropouts + self.dropout: float = 0.1 # dropout probability for the transformer + self.attention_dropout: float = 0.1 # dropout probability for attention weights + self.activation_dropout: float = 0.0 # dropout probability after activation in FFN + self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer + self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) + + # positional embeddings + self.conv_pos: int = 128 # number of filters for convolutional positional embeddings + self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding + + # relative position embedding + self.relative_position_embedding: bool = False # apply relative position embedding + self.num_buckets: int = 320 # number of buckets for relative position embedding + self.max_distance: int = 1280 # maximum distance for relative position embedding + self.gru_rel_pos: bool = False # apply gated relative position embedding + + # label predictor + self.finetuned_model: bool = False # whether the model is a fine-tuned model. 
+ self.predictor_dropout: float = 0.1 # dropout probability for the predictor + self.predictor_class: int = 527 # target class number for the predictor + + if cfg is not None: + self.update(cfg) + + def update(self, cfg: dict): + self.__dict__.update(cfg) + + +class BEATs(nn.Module): + def __init__( + self, + cfg: BEATsConfig, + ) -> None: + super().__init__() + logger.info(f"BEATs Config: {cfg.__dict__}") + + self.cfg = cfg + + self.embed = cfg.embed_dim + self.post_extract_proj = ( + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim + else None + ) + + self.input_patch_size = cfg.input_patch_size + self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, + bias=cfg.conv_bias) + + self.dropout_input = nn.Dropout(cfg.dropout_input) + + assert not cfg.deep_norm or not cfg.layer_norm_first + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.embed) + + if cfg.finetuned_model: + self.predictor_dropout = nn.Dropout(cfg.predictor_dropout) + self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class) + else: + self.predictor = None + + def forward_padding_mask( + self, + features: torch.Tensor, + padding_mask: torch.Tensor, + ) -> torch.Tensor: + extra = padding_mask.size(1) % features.size(1) + if extra > 0: + padding_mask = padding_mask[:, :-extra] + padding_mask = padding_mask.view( + padding_mask.size(0), features.size(1), -1 + ) + padding_mask = padding_mask.all(-1) + return padding_mask + + def preprocess( + self, + source: torch.Tensor, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, + ) -> torch.Tensor: + fbanks = [] + for waveform in source: + waveform = waveform.unsqueeze(0) * 2 ** 15 + fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10) + fbanks.append(fbank) + fbank = torch.stack(fbanks, dim=0) + fbank = (fbank - fbank_mean) / (2 * fbank_std) + return fbank + + def 
extract_features( + self, + fbank: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, + ): + ## NOTE: preprocessing is done separately in lavis.processsors.audio_processors.BeatsAudioProcessor + # fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std) + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(fbank, padding_mask) + + fbank = fbank.unsqueeze(1) + features = self.patch_embedding(fbank) + features = features.reshape(features.shape[0], features.shape[1], -1) + features = features.transpose(1, 2) + features = self.layer_norm(features) + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(features, padding_mask) + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + x = self.dropout_input(features) + + x, layer_results = self.encoder( + x, + padding_mask=padding_mask, + ) + + if self.predictor is not None: + x = self.predictor_dropout(x) + logits = self.predictor(x) + + if padding_mask is not None and padding_mask.any(): + logits[padding_mask] = 0 + logits = logits.sum(dim=1) + logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits) + else: + logits = logits.mean(dim=1) + + lprobs = torch.sigmoid(logits) + + return lprobs, padding_mask + else: + return x, padding_mask diff --git a/lavis/models/beats/LICENSE_BEATs.txt b/lavis/models/beats/LICENSE_BEATs.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bb386e6f7e1b94c1f76d779fcd9b68720e833c3 --- /dev/null +++ b/lavis/models/beats/LICENSE_BEATs.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) Microsoft Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/lavis/models/beats/README.md b/lavis/models/beats/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd54e797a6ae5152c92fd2b37a8531f13e103f00 --- /dev/null +++ b/lavis/models/beats/README.md @@ -0,0 +1,127 @@ + +# BEATs + +[**BEATs**](https://arxiv.org/abs/2212.09058): **Audio Pre-Training with Acoustic Tokenizers** + +Official PyTorch implementation and pretrained models of BEATs + +## Pre-Trained and Fine-Tuned Tokenizers and Models +Iterations | Tokenizer | Pre-Trained Model | AudioSet Fine-Tuned Model 1 | AudioSet Fine-Tuned Model 2 +|---|---|---|---|--- +Iter1 | Random Projection | [BEATs_iter1](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 
(cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | +Iter2 | [Tokenizer_iter2](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter2](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | +Iter3 | [Tokenizer_iter3](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 
(cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | +Iter3+ | [Tokenizer_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | +Iter3+ | [Tokenizer_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) 
(cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | + + +### Load Tokenizers + +```python +import torch +from Tokenizers import TokenizersConfig, Tokenizers + +# load the pre-trained checkpoints +checkpoint = torch.load('/path/to/tokenizer.pt') + +cfg = TokenizersConfig(checkpoint['cfg']) +BEATs_tokenizer = Tokenizers(cfg) +BEATs_tokenizer.load_state_dict(checkpoint['model']) +BEATs_tokenizer.eval() + +# tokenize the audio and generate the labels +audio_input_16khz = torch.randn(1, 10000) +padding_mask = torch.zeros(1, 10000).bool() + +labels = BEATs_tokenizer.extract_labels(audio_input_16khz, padding_mask=padding_mask) +``` + + +### Load Pre-Trained Models + +```python +import torch +from BEATs import BEATs, BEATsConfig + +# load the pre-trained checkpoints +checkpoint = torch.load('/path/to/model.pt') + +cfg = BEATsConfig(checkpoint['cfg']) +BEATs_model = BEATs(cfg) +BEATs_model.load_state_dict(checkpoint['model']) +BEATs_model.eval() + +# extract the audio representation +audio_input_16khz = torch.randn(1, 10000) +padding_mask = torch.zeros(1, 10000).bool() + +representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0] +``` + + +### Load Fine-tuned Models + +```python +import torch +from BEATs import BEATs, BEATsConfig + +# load the fine-tuned checkpoints +checkpoint = torch.load('/path/to/model.pt') + +cfg = BEATsConfig(checkpoint['cfg']) +BEATs_model = BEATs(cfg) +BEATs_model.load_state_dict(checkpoint['model']) +BEATs_model.eval() + +# predict the classification
probability of each class +audio_input_16khz = torch.randn(3, 10000) +padding_mask = torch.zeros(3, 10000).bool() + +probs = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0] + +for i, (top5_label_prob, top5_label_idx) in enumerate(zip(*probs.topk(k=5))): + top5_label = [checkpoint['label_dict'][label_idx.item()] for label_idx in top5_label_idx] + print(f'Top 5 predicted labels of the {i}th audio are {top5_label} with probability of {top5_label_prob}') +``` + +## Evaluation Results + +### Comparing with the SOTA Single Models +![alt text](Evaluation_Results/Comparing_with_the_SOTA_Single_Models.png) + + +### Comparing with the SOTA Ensemble Models +![alt text](Evaluation_Results/Comparing_with_the_SOTA_Ensemble_Models.png) + + +### Comparing Different BEATS Tokenizers +![alt text](Evaluation_Results/Comparing_Different_BEATS_Tokenizers.png) + + +### Comparing Different Pre-Training Targets +![alt text](Evaluation_Results/Comparing_Different_Pre-Training_Targets.png) + + +## License +This project is licensed under the license found in the LICENSE file in the root directory of this source tree. +Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) and [VQGAN](https://github.com/CompVis/taming-transformers) projects. + +[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct) + + +### Reference +If you find our work useful in your research, please cite the following paper: +``` latex +@article{Chen2022beats, + title = {BEATs: Audio Pre-Training with Acoustic Tokenizers}, + author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Shujie Liu and Daniel Tompkins and Zhuo Chen and Furu Wei}, + eprint={2212.09058}, + archivePrefix={arXiv}, + year={2022} +} +``` +### Contact Information + +For help or issues using BEATs models, please submit a GitHub issue. + +For other communications related to BEATs, please contact Yu Wu (`yuwu1@microsoft.com`).
logger = logging.getLogger(__name__)


class TokenizersConfig:
    """Hyper-parameter container for the BEATs acoustic tokenizer.

    Every attribute is initialised to a default and can be overridden by
    passing a mapping (typically ``checkpoint['cfg']``) to the constructor.
    """

    def __init__(self, cfg=None):
        self.input_patch_size: int = -1  # patch size of patch embedding
        self.embed_dim: int = 512  # patch embedding dimension
        self.conv_bias: bool = False  # include bias in conv encoder

        self.encoder_layers: int = 12  # num encoder layers in the transformer
        self.encoder_embed_dim: int = 768  # encoder embedding dimension
        self.encoder_ffn_embed_dim: int = 3072  # encoder embedding dimension for FFN
        self.encoder_attention_heads: int = 12  # num encoder attention heads
        self.activation_fn: str = "gelu"  # activation function to use

        self.layer_norm_first: bool = False  # apply layernorm first in the transformer
        self.deep_norm: bool = False  # apply deep_norm first in the transformer

        # dropouts
        self.dropout: float = 0.1  # dropout probability for the transformer
        self.attention_dropout: float = 0.1  # dropout probability for attention weights
        self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
        self.encoder_layerdrop: float = 0.0  # probability of dropping a transformer layer
        self.dropout_input: float = 0.0  # dropout to apply to the input (after feat extr)

        # positional embeddings
        self.conv_pos: int = 128  # number of filters for convolutional positional embeddings
        self.conv_pos_groups: int = 16  # number of groups for convolutional positional embedding

        # relative position embedding
        self.relative_position_embedding: bool = False  # apply relative position embedding
        self.num_buckets: int = 320  # number of buckets for relative position embedding
        self.max_distance: int = 1280  # maximum distance for relative position embedding
        self.gru_rel_pos: bool = False  # apply gated relative position embedding

        # quantizer
        self.quant_n: int = 1024  # codebook number in quantizer
        self.quant_dim: int = 256  # codebook dimension in quantizer

        if cfg is not None:
            self.update(cfg)

    def update(self, cfg: dict):
        """Overwrite attributes with the entries of ``cfg``.

        Keys are copied verbatim into ``__dict__``; unknown keys are accepted
        and simply become new attributes.
        """
        self.__dict__.update(cfg)


class Tokenizers(nn.Module):
    """BEATs acoustic tokenizer: waveform -> discrete codebook indices.

    Pipeline (see ``extract_labels``): filter-bank features, 2-D patch
    embedding, layer norm, optional linear projection, transformer encoder,
    a small MLP, then a ``NormEMAVectorQuantizer`` whose indices are returned.
    """

    def __init__(
            self,
            cfg: TokenizersConfig,
    ) -> None:
        super().__init__()
        # Lazy %-formatting: the dict is only rendered if INFO is enabled.
        logger.info("Tokenizers Config: %s", cfg.__dict__)

        self.cfg = cfg

        self.embed = cfg.embed_dim
        # Only project when the patch embedding and encoder widths differ.
        self.post_extract_proj = (
            nn.Linear(self.embed, cfg.encoder_embed_dim)
            if self.embed != cfg.encoder_embed_dim
            else None
        )

        self.input_patch_size = cfg.input_patch_size
        self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size,
                                         stride=self.input_patch_size, bias=cfg.conv_bias)

        self.dropout_input = nn.Dropout(cfg.dropout_input)

        # deep_norm and layer_norm_first are mutually exclusive.
        assert not cfg.deep_norm or not cfg.layer_norm_first
        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

        self.quantize = NormEMAVectorQuantizer(
            n_embed=cfg.quant_n, embedding_dim=cfg.quant_dim, beta=1.0, kmeans_init=True, decay=0.99,
        )
        self.quant_n = cfg.quant_n
        self.quantize_layer = nn.Sequential(
            nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
            nn.Tanh(),
            nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim)  # for quantize
        )

    def forward_padding_mask(
            self,
            features: torch.Tensor,
            padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Down-sample ``padding_mask`` to ``features.size(1)`` steps.

        The mask's second dimension is trimmed to a multiple of
        ``features.size(1)`` and split into that many equal groups; a group
        is marked as padding only if every element in it is padding.
        """
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(
            padding_mask.size(0), features.size(1), -1
        )
        padding_mask = padding_mask.all(-1)
        return padding_mask

    def preprocess(
            self,
            source: torch.Tensor,
            fbank_mean: float = 15.41663,
            fbank_std: float = 6.55582,
    ) -> torch.Tensor:
        """Convert a batch of waveforms into normalised filter-bank features.

        Each waveform is scaled by ``2 ** 15`` and passed to the Kaldi-style
        ``fbank`` (128 mel bins, ``sample_frequency=16000``,
        ``frame_length=25``, ``frame_shift=10``); the stacked result is
        normalised as ``(fbank - fbank_mean) / (2 * fbank_std)``.
        """
        fbanks = []
        for waveform in source:
            waveform = waveform.unsqueeze(0) * 2 ** 15
            fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25,
                                   frame_shift=10)
            fbanks.append(fbank)
        fbank = torch.stack(fbanks, dim=0)
        fbank = (fbank - fbank_mean) / (2 * fbank_std)
        return fbank

    def extract_labels(
            self,
            source: torch.Tensor,
            padding_mask: Optional[torch.Tensor] = None,
            fbank_mean: float = 15.41663,
            fbank_std: float = 6.55582,
    ):
        """Tokenize ``source`` and return the quantizer's codebook indices.

        ``padding_mask`` (if given) is reduced twice: first to the fbank
        frame rate, then to the patch rate after patch embedding.
        """
        fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(fbank, padding_mask)

        fbank = fbank.unsqueeze(1)
        features = self.patch_embedding(fbank)
        # Flatten the 2-D patch grid into one sequence axis, channels last.
        features = features.reshape(features.shape[0], features.shape[1], -1)
        features = features.transpose(1, 2)
        features = self.layer_norm(features)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        x = self.dropout_input(features)

        # Per-layer results are not needed for tokenization.
        x, _ = self.encoder(
            x,
            padding_mask=padding_mask,
        )

        quantize_input = self.quantize_layer(x)
        # The quantizer returns three values; only the indices (third) are used.
        _, _, embed_ind = self.quantize(quantize_input)

        return embed_ind
a/lavis/models/beats/__pycache__/Tokenizers.cpython-310.pyc b/lavis/models/beats/__pycache__/Tokenizers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e0aa1a804c7afda68a51d7d0cb42523fd2234df Binary files /dev/null and b/lavis/models/beats/__pycache__/Tokenizers.cpython-310.pyc differ diff --git a/lavis/models/beats/__pycache__/backbone.cpython-310.pyc b/lavis/models/beats/__pycache__/backbone.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45cb783b1dfcd5ab37957b0f901aaa132f1e58b7 Binary files /dev/null and b/lavis/models/beats/__pycache__/backbone.cpython-310.pyc differ diff --git a/lavis/models/beats/__pycache__/modules.cpython-310.pyc b/lavis/models/beats/__pycache__/modules.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b98544ce718eec8fd930201eb14d4a9ad827305 Binary files /dev/null and b/lavis/models/beats/__pycache__/modules.cpython-310.pyc differ diff --git a/lavis/models/beats/__pycache__/quantizer.cpython-310.pyc b/lavis/models/beats/__pycache__/quantizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2f5ec6f1ce24ec0aff24d68a444193672a03896 Binary files /dev/null and b/lavis/models/beats/__pycache__/quantizer.cpython-310.pyc differ diff --git a/lavis/models/beats/backbone.py b/lavis/models/beats/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..ea843ff650d67580a2b8aa045fc612f100fbf2d5 --- /dev/null +++ b/lavis/models/beats/backbone.py @@ -0,0 +1,783 @@ +# -------------------------------------------------------- +# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058) +# Github source: https://github.com/microsoft/unilm/tree/master/beats +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on fairseq code bases +# https://github.com/pytorch/fairseq +# 
class TransformerEncoder(nn.Module):
    """Transformer encoder with a convolutional positional embedding.

    Hyper-parameters are read from ``args`` (``dropout``,
    ``encoder_embed_dim``, ``conv_pos``, ``conv_pos_groups``,
    ``encoder_layers``, ...). When relative position embedding is enabled,
    all layers share the bias table of layer 0.
    """

    def __init__(self, args):
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        self.pos_conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        dropout = 0
        std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
        nn.init.constant_(self.pos_conv.bias, 0)

        self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
        self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

        if hasattr(args, "relative_position_embedding"):
            self.relative_position_embedding = args.relative_position_embedding
            self.num_buckets = args.num_buckets
            self.max_distance = args.max_distance
        else:
            self.relative_position_embedding = False
            self.num_buckets = 0
            self.max_distance = 0

        self.layers = nn.ModuleList(
            [
                TransformerSentenceEncoderLayer(
                    embedding_dim=self.embedding_dim,
                    ffn_embedding_dim=args.encoder_ffn_embed_dim,
                    num_attention_heads=args.encoder_attention_heads,
                    dropout=self.dropout,
                    attention_dropout=args.attention_dropout,
                    activation_dropout=args.activation_dropout,
                    activation_fn=args.activation_fn,
                    layer_norm_first=args.layer_norm_first,
                    deep_norm=args.deep_norm,
                    has_relative_attention_bias=self.relative_position_embedding,
                    num_buckets=self.num_buckets,
                    max_distance=self.max_distance,
                    gru_rel_pos=args.gru_rel_pos,
                    encoder_layers=args.encoder_layers,
                )
                for _ in range(args.encoder_layers)
            ]
        )
        if self.relative_position_embedding:
            # Tie every layer's relative-attention bias to layer 0's table.
            for i in range(1, args.encoder_layers):
                del self.layers[i].self_attn.relative_attention_bias
                self.layers[i].self_attn.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        self.apply(init_bert_params)

        if args.deep_norm:
            # Re-initialise projections with the deep-norm gain after init_bert_params.
            deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4)
            for i in range(args.encoder_layers):
                nn.init.xavier_normal_(self.layers[i].self_attn.k_proj.weight, gain=1)
                nn.init.xavier_normal_(self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta)
                nn.init.xavier_normal_(self.layers[i].self_attn.q_proj.weight, gain=1)
                nn.init.xavier_normal_(self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta)
                nn.init.xavier_normal_(self.layers[i].fc1.weight, gain=deep_norm_beta)
                nn.init.xavier_normal_(self.layers[i].fc2.weight, gain=deep_norm_beta)

        self.layer_wise_gradient_decay_ratio = getattr(args, "layer_wise_gradient_decay_ratio", 1)

    def forward(self, x, padding_mask=None, layer=None):
        """Encode ``x`` and return ``(x, layer_results)``.

        The final layer norm is applied only in pre-LN mode
        (``layer_norm_first``) and only when no target ``layer`` is requested.
        """
        x, layer_results = self.extract_features(x, padding_mask, layer)

        if self.layer_norm_first and layer is None:
            x = self.layer_norm(x)

        return x, layer_results

    def extract_features(self, x, padding_mask=None, tgt_layer=None):
        """Run the positional conv and the layer stack.

        Returns ``(x, layer_results)``; ``layer_results`` is populated only
        when ``tgt_layer`` is given, and the stack stops after that layer.
        """

        if padding_mask is not None:
            # NOTE: this writes into the caller's tensor in place.
            x[padding_mask] = 0

        x_conv = self.pos_conv(x.transpose(1, 2))
        x_conv = x_conv.transpose(1, 2)
        x = x + x_conv

        if not self.layer_norm_first:
            x = self.layer_norm(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        layer_results = []
        z = None
        if tgt_layer is not None:
            layer_results.append((x, z))
        r = None
        pos_bias = None
        for i, layer in enumerate(self.layers):
            if self.layer_wise_gradient_decay_ratio != 1.0:
                x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
            # LayerDrop: during training a layer is skipped when the draw is
            # not greater than ``self.layerdrop``.
            dropout_probability = np.random.random()
            if not self.training or (dropout_probability > self.layerdrop):
                x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_bias)
            if tgt_layer is not None:
                layer_results.append((x, z))
            if i == tgt_layer:
                r = x
                break

        if r is not None:
            x = r

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, layer_results


class TransformerSentenceEncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    With ``layer_norm_first`` the layer norms precede each sub-block;
    otherwise they follow it and the residual is scaled by
    ``deep_norm_alpha`` (1 unless ``deep_norm`` is set).
    """

    def __init__(
            self,
            embedding_dim: int = 768,
            ffn_embedding_dim: int = 3072,
            num_attention_heads: int = 8,
            dropout: float = 0.1,
            attention_dropout: float = 0.1,
            activation_dropout: float = 0.1,
            activation_fn: str = "relu",
            layer_norm_first: bool = False,
            deep_norm: bool = False,
            has_relative_attention_bias: bool = False,
            num_buckets: int = 0,
            max_distance: int = 0,
            rescale_init: bool = False,
            gru_rel_pos: bool = False,
            encoder_layers: int = 0,
    ) -> None:

        super().__init__()
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        self.activation_name = activation_fn
        self.activation_fn = get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            has_relative_attention_bias=has_relative_attention_bias,
            num_buckets=num_buckets,
            max_distance=max_distance,
            rescale_init=rescale_init,
            gru_rel_pos=gru_rel_pos,
        )

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(self.activation_dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.layer_norm_first = layer_norm_first

        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)

        if self.activation_name == "glu":
            self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
        else:
            self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        self.final_layer_norm = LayerNorm(self.embedding_dim)

        self.deep_norm = deep_norm
        if self.deep_norm:
            self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
        else:
            self.deep_norm_alpha = 1

    def forward(
            self,
            x: torch.Tensor,
            self_attn_mask: torch.Tensor = None,
            self_attn_padding_mask: torch.Tensor = None,
            need_weights: bool = False,
            pos_bias=None
    ):
        """Apply the layer and return ``(x, attn, pos_bias)``.

        ``pos_bias`` is passed to the attention module and the value it
        returns is handed back so the caller can feed it to the next layer.
        In the ``layer_norm_first`` branch attention weights are never
        requested (``need_weights`` is ignored there).
        """
        residual = x

        if self.layer_norm_first:
            x = self.self_attn_layer_norm(x)
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=False,
                attn_mask=self_attn_mask,
                position_bias=pos_bias
            )
            x = self.dropout1(x)
            x = residual + x

            residual = x
            x = self.final_layer_norm(x)
            if self.activation_name == "glu":
                x = self.fc1(x)
            else:
                x = self.activation_fn(self.fc1(x))
            x = self.dropout2(x)
            x = self.fc2(x)
            x = self.dropout3(x)
            x = residual + x
        else:
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=need_weights,
                attn_mask=self_attn_mask,
                position_bias=pos_bias
            )

            x = self.dropout1(x)
            x = residual * self.deep_norm_alpha + x

            x = self.self_attn_layer_norm(x)

            residual = x
            if self.activation_name == "glu":
                x = self.fc1(x)
            else:
                x = self.activation_fn(self.fc1(x))
            x = self.dropout2(x)
            x = self.fc2(x)
            x = self.dropout3(x)
            x = residual * self.deep_norm_alpha + x
            x = self.final_layer_norm(x)

        return x, attn, pos_bias


class MultiheadAttention(nn.Module):
    """Multi-headed attention.

    See "Attention Is All You Need" for more details.
    """

    def __init__(
            self,
            embed_dim,
            num_heads,
            kdim=None,
            vdim=None,
            dropout=0.0,
            bias=True,
            add_bias_kv=False,
            add_zero_attn=False,
            self_attention=False,
            encoder_decoder_attention=False,
            q_noise=0.0,
            qn_block_size=8,
            has_relative_attention_bias=False,
            num_buckets=32,
            max_distance=128,
            gru_rel_pos=False,
            rescale_init=False,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = nn.Dropout(dropout)

        self.has_relative_attention_bias = has_relative_attention_bias
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)

        self.head_dim = embed_dim // num_heads
        self.q_head_dim = self.head_dim
        self.k_head_dim = self.head_dim
        assert (
                self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and " "value to be of the same size"
        )

        # rescale_init drops the bias on the key projection.
        k_bias = True
        if rescale_init:
            k_bias = False

        k_embed_dim = embed_dim
        q_embed_dim = embed_dim

        self.k_proj = quant_noise(
            nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise, qn_block_size
        )
        self.v_proj = quant_noise(
            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.q_proj = quant_noise(
            nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise, qn_block_size
        )

        self.out_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.gru_rel_pos = gru_rel_pos
        if self.gru_rel_pos:
            self.grep_linear = nn.Linear(self.q_head_dim, 8)
            self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))

        self.reset_parameters()

    def reset_parameters(self):
        """(Re-)initialise projection weights, biases and the relative bias table."""
        if self.qkv_same_dim:
            # Empirically observed the convergence to be much better with
            # the scaled initialization
            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
        else:
            nn.init.xavier_uniform_(self.k_proj.weight)
            nn.init.xavier_uniform_(self.v_proj.weight)
            nn.init.xavier_uniform_(self.q_proj.weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.out_proj.bias is not None:
            nn.init.constant_(self.out_proj.bias, 0.0)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)
        if self.has_relative_attention_bias:
            nn.init.xavier_normal_(self.relative_attention_bias.weight)

    def _relative_positions_bucket(self, relative_positions, bidirectional=True):
        """Map signed relative positions to bucket indices.

        Small distances (below ``max_exact``) get one bucket each; larger
        ones are binned logarithmically up to ``max_distance``. In the
        bidirectional case half of the buckets are reserved for positive
        offsets.
        """
        num_buckets = self.num_buckets
        max_distance = self.max_distance
        relative_buckets = 0

        if bidirectional:
            num_buckets = num_buckets // 2
            relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
            relative_positions = torch.abs(relative_positions)
        else:
            relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions))

        max_exact = num_buckets // 2
        is_small = relative_positions < max_exact

        relative_position_if_large = max_exact + (
                torch.log(relative_positions.float() / max_exact)
                / math.log(max_distance / max_exact)
                * (num_buckets - max_exact)
        ).to(torch.long)
        # Clamp to the last bucket of this half.
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length):
        """Return the relative-position bias, permuted to put the head axis first."""
        context_position = torch.arange(query_length, dtype=torch.long)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
        relative_position = memory_position - context_position
        relative_position_bucket = self._relative_positions_bucket(
            relative_position,
            bidirectional=True
        )
        relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
        values = self.relative_attention_bias(relative_position_bucket)
        values = values.permute([2, 0, 1])
        return values

    def forward(
            self,
            query,
            key: Optional[Tensor],
            value: Optional[Tensor],
            key_padding_mask: Optional[Tensor] = None,
            incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
            need_weights: bool = True,
            static_kv: bool = False,
            attn_mask: Optional[Tensor] = None,
            before_softmax: bool = False,
            need_head_weights: bool = False,
            position_bias: Optional[Tensor] = None
    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: True).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.

        Returns:
            ``(attn, attn_weights, position_bias)``; ``attn_weights`` is
            ``None`` unless ``need_weights`` is set. With ``before_softmax``
            the tuple is ``(raw_attn_weights, v, position_bias)`` instead.
        """
        if need_head_weights:
            need_weights = True

        is_tpu = query.device.type == "xla"

        tgt_len, bsz, embed_dim = query.size()
        src_len = tgt_len
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        if key is not None:
            src_len, key_bsz, _ = key.size()
            if not torch.jit.is_scripting():
                assert key_bsz == bsz
                assert value is not None
                # Parenthesised on purpose: ``assert a, b == c`` would treat
                # ``b == c`` as the failure message and never check the shape.
                assert (src_len, bsz) == value.shape[:2]

        if self.has_relative_attention_bias and position_bias is None:
            position_bias = self.compute_bias(tgt_len, src_len)
            position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            q = self.q_proj(query)
            k = self.k_proj(query)
            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)

        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q *= self.scaling
        # q is additionally divided by alpha here; the logits are multiplied
        # by alpha again after the row-wise max has been subtracted below.
        alpha = 32
        q *= 1 / alpha

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
                    ],
                    dim=1,
                )

        q = (
            q.contiguous()
                .view(tgt_len, bsz * self.num_heads, self.q_head_dim)
                .transpose(0, 1)
        )
        if k is not None:
            k = (
                k.contiguous()
                    .view(-1, bsz * self.num_heads, self.k_head_dim)
                    .transpose(0, 1)
            )
        if v is not None:
            v = (
                v.contiguous()
                    .view(-1, bsz * self.num_heads, self.head_dim)
                    .transpose(0, 1)
            )

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if "prev_key" in saved_state:
                _prev_key = saved_state["prev_key"]
                assert _prev_key is not None
                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    assert k is not None
                    k = torch.cat([prev_key, k], dim=1)
                src_len = k.size(1)
            if "prev_value" in saved_state:
                _prev_value = saved_state["prev_value"]
                assert _prev_value is not None
                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    assert v is not None
                    v = torch.cat([prev_value, v], dim=1)
            prev_key_padding_mask: Optional[Tensor] = None
            if "prev_key_padding_mask" in saved_state:
                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
            assert k is not None and v is not None
            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
                key_padding_mask=key_padding_mask,
                prev_key_padding_mask=prev_key_padding_mask,
                batch_size=bsz,
                src_len=k.size(1),
                static_kv=static_kv,
            )

            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_key_padding_mask"] = key_padding_mask
            # In this branch incremental_state is never None
            assert incremental_state is not None
            incremental_state = self._set_input_buffer(incremental_state, saved_state)
        assert k is not None
        assert k.size(1) == src_len

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            assert v is not None
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        torch.zeros(key_padding_mask.size(0), 1).type_as(
                            key_padding_mask
                        ),
                    ],
                    dim=1,
                )

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        # Subtract the row-wise max before scaling back up by alpha.
        attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            if not is_tpu:
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                    float("-inf"),
                )
            else:
                attn_weights = attn_weights.transpose(0, 2)
                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
                attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if before_softmax:
            return attn_weights, v, position_bias

        if position_bias is not None:
            attn_mask_rel_pos = position_bias
            if self.gru_rel_pos == 1:
                # Undo the scaling applied to q above before gating.
                query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling
                _B, _H, _L, __ = query_layer.size()
                gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view(
                    _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1)
                gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
                attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias

            attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())

            attn_weights = attn_weights + attn_mask_rel_pos

        attn_weights_float = F.softmax(
            attn_weights, dim=-1
        )
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = self.dropout_module(attn_weights)

        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)
        attn_weights: Optional[Tensor] = None
        if need_weights:
            attn_weights = attn_weights_float.view(
                bsz, self.num_heads, tgt_len, src_len
            ).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

        return attn, attn_weights, position_bias

    @staticmethod
    def _append_prev_key_padding_mask(
            key_padding_mask: Optional[Tensor],
            prev_key_padding_mask: Optional[Tensor],
            batch_size: int,
            src_len: int,
            static_kv: bool,
    ) -> Optional[Tensor]:
        """Combine the cached and the current key padding masks.

        Missing halves are filled with zeros so the result spans ``src_len``
        columns; returns ``None`` when both inputs are ``None``.
        """
        # saved key padding masks have shape (bsz, seq_len)
        if prev_key_padding_mask is not None and static_kv:
            new_key_padding_mask = prev_key_padding_mask
        elif prev_key_padding_mask is not None and key_padding_mask is not None:
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
            )
        # During incremental decoding, as the padding token enters and
        # leaves the frame, there will be a time when prev or current
        # is None
        elif prev_key_padding_mask is not None:
            if src_len > prev_key_padding_mask.size(1):
                filler = torch.zeros(
                    (batch_size, src_len - prev_key_padding_mask.size(1)),
                    device=prev_key_padding_mask.device,
                )
                new_key_padding_mask = torch.cat(
                    [prev_key_padding_mask.float(), filler.float()], dim=1
                )
            else:
                new_key_padding_mask = prev_key_padding_mask.float()
        elif key_padding_mask is not None:
            if src_len > key_padding_mask.size(1):
                filler = torch.zeros(
                    (batch_size, src_len - key_padding_mask.size(1)),
                    device=key_padding_mask.device,
                )
                new_key_padding_mask = torch.cat(
                    [filler.float(), key_padding_mask.float()], dim=1
                )
            else:
                new_key_padding_mask = key_padding_mask.float()
        else:
            new_key_padding_mask = prev_key_padding_mask
        return new_key_padding_mask

    def _get_input_buffer(
            self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
    ) -> Dict[str, Optional[Tensor]]:
        """Fetch the cached attention state, or an empty dict if none exists."""
        # NOTE(review): ``get_incremental_state`` is not defined on this class
        # in the visible source; presumably a mixin/decorator supplied it
        # upstream. The incremental-decoding path fails without it -- confirm.
        result = self.get_incremental_state(incremental_state, "attn_state")
        if result is not None:
            return result
        else:
            empty_result: Dict[str, Optional[Tensor]] = {}
            return empty_result

    def _set_input_buffer(
            self,
            incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
            buffer: Dict[str, Optional[Tensor]],
    ):
        """Store ``buffer`` as the cached attention state."""
        # NOTE(review): ``set_incremental_state`` is likewise not defined on
        # this class in the visible source -- confirm before relying on it.
        return self.set_incremental_state(incremental_state, "attn_state", buffer)

    def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
        """Return ``attn_weights`` unchanged."""
        return attn_weights
class SamePad(nn.Module):
    """Trim trailing positions from the last dimension of a 3-D tensor.

    The number of trimmed positions is ``kernel_size - 1`` in causal mode;
    otherwise it is 1 for an even ``kernel_size`` and 0 for an odd one.
    """

    def __init__(self, kernel_size, causal=False):
        super().__init__()
        if not causal:
            # Even kernels leave one surplus position, odd kernels none.
            self.remove = 0 if kernel_size % 2 else 1
        else:
            self.remove = kernel_size - 1

    def forward(self, x):
        """Return ``x`` without its last ``self.remove`` entries on dim 2."""
        if self.remove <= 0:
            return x
        return x[:, :, : -self.remove]
class GLU_Linear(nn.Module):
    """Linear projection followed by a gated linear unit.

    The input is projected to ``2 * output_dim`` features on its last
    dimension; the first half is the value and the second half the gate.
    The output is ``value * act(gate)``, or ``value * gate`` for the
    ``"bilinear"`` type.

    Args:
        input_dim: size of the last dimension of the input.
        output_dim: size of the last dimension of the output.
        glu_type: one of ``"sigmoid"``, ``"swish"``, ``"relu"``, ``"gelu"``
            or ``"bilinear"``.
        bias_in_glu: whether the linear projection has a bias term.

    Raises:
        ValueError: if ``glu_type`` is not one of the supported names
            (previously this surfaced later as an ``AttributeError`` in
            ``forward``).
    """

    def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
        super(GLU_Linear, self).__init__()

        self.glu_type = glu_type
        self.output_dim = output_dim

        if glu_type == "sigmoid":
            self.glu_act = torch.nn.Sigmoid()
        elif glu_type == "swish":
            self.glu_act = Swish()
        elif glu_type == "relu":
            self.glu_act = torch.nn.ReLU()
        elif glu_type == "gelu":
            self.glu_act = torch.nn.GELU()
        elif glu_type != "bilinear":
            # "bilinear" needs no activation; anything else is a typo.
            raise ValueError("unsupported glu_type: {!r}".format(glu_type))

        self.linear = nn.Linear(input_dim, output_dim * 2, bias=bool(bias_in_glu))

    def forward(self, x):
        """Apply the gated projection; channels must be the last dimension."""
        x = self.linear(x)

        # Ellipsis indexing keeps this valid for inputs of any rank, not
        # only (batch, time, channel).
        value = x[..., : self.output_dim]
        gate = x[..., self.output_dim : self.output_dim * 2]

        if self.glu_type == "bilinear":
            return value * gate
        return value * self.glu_act(gate)
def quant_noise(module, p, block_size):
    """
    Attach quantization noise to a module's weights, as described in
    "Training with Quantization Noise for Extreme Model Compression".

    A forward pre-hook is registered that, in training mode only, zeroes
    randomly chosen blocks of ``module.weight`` with probability ``p`` and
    rescales the result by ``1 / (1 - p)``. The hook overwrites
    ``module.weight.data`` on every training forward pass.

    Args:
        - module: nn.Linear, nn.Embedding or nn.Conv2d
        - p: amount of Quantization Noise (nothing is registered if <= 0)
        - block_size: size of the blocks for subsequent quantization with iPQ

    Returns:
        The same ``module`` object.

    Remarks:
        - Module weights must have the right sizes wrt the block size
        - For convolutions other than 1x1, whole (out, in) kernels are
          dropped rather than blocks of ``block_size`` weights
    """

    # Without noise there is nothing to hook.
    if p <= 0:
        return module

    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))

    # A 4-D weight marks a convolution; everything else is a 2-D matrix.
    is_conv = module.weight.ndim == 4

    if is_conv:
        if module.kernel_size == (1, 1):
            assert (
                module.in_channels % block_size == 0
            ), "Input channels must be a multiple of block sizes"
        else:
            k = module.kernel_size[0] * module.kernel_size[1]
            assert k % block_size == 0, "Kernel size must be a multiple of block size"
    else:
        assert (
            module.weight.size(1) % block_size == 0
        ), "Input features must be a multiple of block sizes"

    def _forward_pre_hook(mod, input):
        # Evaluation runs on the untouched weights.
        if not mod.training:
            return

        weight = mod.weight

        if not is_conv:
            out_features, in_features = weight.size(0), weight.size(1)
            # One Bernoulli draw per block, expanded to cover the block.
            mask = torch.zeros(
                in_features // block_size * out_features, device=weight.device
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
        elif mod.kernel_size == (1, 1):
            in_channels = mod.in_channels
            out_channels = mod.out_channels
            mask = torch.zeros(
                int(in_channels // block_size * out_channels),
                device=weight.device,
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
        else:
            # One draw per (out, in) pair, broadcast over the kernel window.
            mask = torch.zeros(
                weight.size(0), weight.size(1), device=weight.device
            )
            mask.bernoulli_(p)
            mask = mask[:, :, None, None].repeat(
                1, 1, mod.kernel_size[0], mod.kernel_size[1]
            )

        # x.bool() is not currently supported in TorchScript
        mask = mask.to(torch.bool)
        scale = 1 / (1 - p)
        mod.weight.data = scale * weight.masked_fill(mask, 0)

    module.register_forward_pre_hook(_forward_pre_hook)
    return module
def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False):
    """Cluster ``samples`` with Lloyd-style k-means iterations.

    Implemented with native torch ops only, so it does not depend on the
    optional ``einops`` import.

    Args:
        samples: 2-D tensor, one sample per row.
        num_clusters: number of centroids to fit.
        num_iters: number of assignment/update rounds.
        use_cosine_sim: if true, assign by largest dot product and
            l2-normalize the updated centroids; otherwise assign by
            smallest squared Euclidean distance.

    Returns:
        ``(means, bins)``: the centroids, shape ``(num_clusters, dim)``, and
        the number of samples assigned to each centroid in the last round
        (all zeros when ``num_iters`` is 0).
    """
    dim, device = samples.shape[-1], samples.device

    means = sample_vectors(samples, num_clusters)
    # Defined up front so that num_iters == 0 still returns a valid count.
    bins = torch.zeros(num_clusters, dtype=torch.long, device=device)

    for _ in range(num_iters):
        if use_cosine_sim:
            dists = samples @ means.t()
        else:
            # (n, 1, d) - (1, c, d) -> (n, c, d); negated so max == nearest.
            diffs = samples.unsqueeze(1) - means.unsqueeze(0)
            dists = -(diffs ** 2).sum(dim=-1)

        buckets = dists.max(dim=-1).indices
        bins = torch.bincount(buckets, minlength=num_clusters)
        zero_mask = bins == 0
        # Avoid dividing by zero for clusters that received no samples.
        bins_min_clamped = bins.masked_fill(zero_mask, 1)

        # Sum the samples of each cluster, then average.
        new_means = samples.new_zeros(num_clusters, dim)
        new_means.index_add_(0, buckets, samples)
        new_means = new_means / bins_min_clamped[..., None]

        if use_cosine_sim:
            new_means = l2norm(new_means)

        # Empty clusters keep their previous centroid.
        means = torch.where(zero_mask[..., None], means, new_means)

    return means, bins
class NormEMAVectorQuantizer(nn.Module):
    """Vector quantizer with an l2-normalized, EMA-updated codebook.

    Inputs are l2-normalized, matched to their nearest codebook entry and
    returned with a straight-through gradient. In training mode (and while
    ``self.embedding.update`` is true) the codebook is moved towards the
    normalized mean of the vectors assigned to each code.

    Args:
        n_embed: number of codebook entries.
        embedding_dim: dimensionality of each entry.
        beta: weight of the commitment loss.
        decay: EMA decay for the codebook and the usage statistics.
        eps: forwarded to ``EmbeddingEMA``.
        statistic_code_usage: when true, keep an EMA of per-code usage in
            the ``cluster_size`` buffer.
        kmeans_init: forwarded to ``EmbeddingEMA``.
        codebook_init_path: forwarded to ``EmbeddingEMA``.
    """

    def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5,
                 statistic_code_usage=True, kmeans_init=False, codebook_init_path=''):
        super().__init__()
        self.codebook_dim = embedding_dim
        self.num_tokens = n_embed
        self.beta = beta
        self.decay = decay

        self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path)

        self.statistic_code_usage = statistic_code_usage
        if statistic_code_usage:
            self.register_buffer('cluster_size', torch.zeros(n_embed))
        if distributed.is_available() and distributed.is_initialized():
            print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!")
            self.all_reduce_fn = distributed.all_reduce
        else:
            self.all_reduce_fn = nn.Identity()

    def reset_cluster_size(self, device):
        """Zero the usage statistics and place them on ``device``."""
        if self.statistic_code_usage:
            self.register_buffer(
                'cluster_size', torch.zeros(self.num_tokens, device=device)
            )

    def forward(self, z):
        """Quantize ``z``.

        Args:
            z: tensor whose elements can be reshaped to
                ``(-1, codebook_dim)``; presumably the feature dimension is
                last, since normalization is applied on dim -1.

        Returns:
            ``(z_q, loss, encoding_indices)``: the quantized tensor with the
            shape of ``z``, the commitment loss scaled by ``beta``, and the
            flat index of the chosen code for every vector.
        """
        z = l2norm(z)
        z_flattened = z.reshape(-1, self.codebook_dim)

        self.embedding.init_embed_(z_flattened)

        # Squared Euclidean distance: |z|^2 + |e|^2 - 2 z.e
        d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
            self.embedding.weight.pow(2).sum(dim=1) - 2 * \
            torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight)

        encoding_indices = torch.argmin(d, dim=1)

        z_q = self.embedding(encoding_indices).view(z.shape)

        encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)

        if not self.training:
            # The cluster_size buffer only exists when usage statistics
            # were requested, so it must not be touched otherwise.
            if self.statistic_code_usage:
                with torch.no_grad():
                    cluster_size = encodings.sum(0)
                    self.all_reduce_fn(cluster_size)
                    ema_inplace(self.cluster_size, cluster_size, self.decay)
        elif self.embedding.update:
            bins = encodings.sum(0)
            self.all_reduce_fn(bins)

            if self.statistic_code_usage:
                ema_inplace(self.cluster_size, bins, self.decay)

            # Unused codes keep their current embedding; clamp their count
            # to 1 so the division below is well defined.
            zero_mask = (bins == 0)
            bins = bins.masked_fill(zero_mask, 1.)

            embed_sum = z_flattened.t() @ encodings
            self.all_reduce_fn(embed_sum)

            embed_normalized = (embed_sum / bins.unsqueeze(0)).t()
            embed_normalized = l2norm(embed_normalized)

            embed_normalized = torch.where(zero_mask[..., None], self.embedding.weight,
                                           embed_normalized)
            norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay)

        # Commitment loss: pulls the encoder output towards its code.
        loss = self.beta * F.mse_loss(z_q.detach(), z)

        # Straight-through estimator: forward z_q, backward identity to z.
        z_q = z + (z_q - z).detach()

        return z_q, loss, encoding_indices
class BeatsEncoder(BaseEncoder):
    """Wrapper around a pre-trained BEATs model used in evaluation mode.

    Args:
        checkpoint_path: URL or local file path of a checkpoint holding a
            ``'cfg'`` entry (model configuration) and a ``'model'`` entry
            (state dict). Defaults to the module-level ``ckp_path``.

    Raises:
        FileNotFoundError: if ``checkpoint_path`` is neither a URL nor an
            existing file.
    """

    def __init__(self, checkpoint_path=ckp_path):
        super().__init__()

        # Load the pre-trained checkpoint onto the CPU so that checkpoints
        # saved from a GPU also load on CPU-only hosts; the weights are
        # copied into the model by load_state_dict below.
        # NOTE(review): torch.load unpickles the file; only trusted
        # checkpoints should be passed here.
        if is_url(checkpoint_path):
            cached_file = download_cached_file(
                checkpoint_path, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(checkpoint_path):
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
        else:
            raise FileNotFoundError(
                "BEATs checkpoint not found: {}".format(checkpoint_path)
            )

        cfg = BEATsConfig(checkpoint['cfg'])
        self.num_features = cfg.encoder_embed_dim
        self.model = BEATs(cfg)
        self.model.load_state_dict(checkpoint['model'])
        self.model.eval()

    @classmethod
    def from_config(cls, cfg):
        """Build the encoder from a config; reads optional ``checkpoint_path``."""
        checkpoint_path = cfg.get("checkpoint_path", ckp_path)
        return cls(checkpoint_path)

    def forward(self, x):
        """Return the first output of ``extract_features`` without gradients.

        Dimension 1 of ``x`` is squeezed before the call (presumably a
        singleton channel axis — confirm against the caller).
        """
        with torch.no_grad():
            return self.model.extract_features(x.squeeze(1))[0]
class BertEmbeddings(nn.Module):
    """Build token embeddings (word + absolute position) and optionally
    prepend query embeddings, followed by LayerNorm and dropout."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        max_positions = config.max_position_embeddings

        self.word_embeddings = nn.Embedding(
            config.vocab_size, hidden, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(max_positions, hidden)

        # The attribute keeps its TensorFlow-style name so that existing
        # checkpoints load without key remapping.
        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Shape (1, max_position_embeddings); stored as a buffer so it is
        # serialized with the module.
        self.register_buffer(
            "position_ids", torch.arange(max_positions).expand((1, -1))
        )
        self.position_embedding_type = getattr(
            config, "position_embedding_type", "absolute"
        )

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        """Embed ``input_ids`` and/or ``query_embeds``.

        With ``input_ids`` the word embeddings (plus position embeddings
        for the "absolute" type) are computed and ``query_embeds``, if
        given, is concatenated in front along dim 1. Without ``input_ids``
        only ``query_embeds`` is used. The result goes through LayerNorm
        and dropout.
        """
        if input_ids is None:
            hidden_states = query_embeds
        else:
            if position_ids is None:
                first = past_key_values_length
                last = input_ids.size()[1] + past_key_values_length
                position_ids = self.position_ids[:, first:last].clone()

            hidden_states = self.word_embeddings(input_ids)
            if self.position_embedding_type == "absolute":
                hidden_states = hidden_states + self.position_embeddings(position_ids)

            if query_embeds is not None:
                hidden_states = torch.cat((query_embeds, hidden_states), dim=1)

        return self.dropout(self.LayerNorm(hidden_states))
def transpose_for_scores(self, x):
    """Split the last dimension of ``x`` into attention heads.

    The last dimension is viewed as ``(num_attention_heads,
    attention_head_size)`` and dimensions 1 and 2 of the resulting 4-D
    tensor are swapped, so the head axis comes before the sequence axis.
    """
    leading_dims = x.size()[:-1]
    per_head = x.view(
        *leading_dims, self.num_attention_heads, self.attention_head_size
    )
    return per_head.permute(0, 2, 1, 3)
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + 
"bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
class BertAttention(nn.Module):
    """Attention sub-layer: ``BertSelfAttention`` followed by
    ``BertSelfOutput`` (dense, dropout, residual LayerNorm)."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.self = BertSelfAttention(config, is_cross_attention)
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the projection layers."""
        if len(heads) == 0:
            return

        attn = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads,
            attn.num_attention_heads,
            attn.attention_head_size,
            self.pruned_heads,
        )

        # Shrink the query/key/value projections and the matching input
        # side of the output projection.
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the bookkeeping in sync with the new head count.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run attention and the output projection.

        Returns a tuple whose first element is the attention output; the
        remaining elements are whatever the inner attention module returned
        after its first element.
        """
        inner = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(inner[0], hidden_states)
        return (projected,) + inner[1:]
class BertOutput(nn.Module):
    """Second half of the feed-forward block: project from the
    intermediate size back to the hidden size, apply dropout, add the
    residual input and normalize."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, hidden)
        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` :class:`BertLayer` modules.

    Each layer receives its own index so it can decide whether to carry a
    cross-attention block.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [BertLayer(config, i) for i in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        """Run ``hidden_states`` through every layer in order.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Self-attention mask forwarded to every layer.
            head_mask: Optional per-layer head masks, indexed by layer number.
            encoder_hidden_states: States forwarded to every layer for
                cross-attention.
            encoder_attention_mask: Mask matching ``encoder_hidden_states``.
            past_key_values: Optional per-layer cached key/values, indexed by
                layer number.
            use_cache: When truthy, the last element of every layer result is
                collected and returned as the new cache.
            output_attentions: When truthy, attention maps are collected.
            output_hidden_states: When truthy, the input of every layer and the
                final output are collected.
            return_dict: Return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a plain tuple.
            query_length: Number of leading query positions, forwarded to every
                layer.

        Returns:
            ``BaseModelOutputWithPastAndCrossAttentions`` or, when
            ``return_dict`` is false, a tuple of the values that are not
            ``None``. ``cross_attentions`` has one entry per layer; the entry is
            ``None`` for a layer that did not run cross-attention.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = (
            () if output_attentions and self.config.add_cross_attention else None
        )

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if getattr(self.config, "gradient_checkpointing", False) and self.training:

                if use_cache:
                    # Logger.warn is a deprecated alias of Logger.warning.
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    # checkpoint() only forwards positional tensors, so the
                    # non-tensor arguments are bound through the closure.
                    def custom_forward(*inputs):
                        return module(
                            *inputs, past_key_value, output_attentions, query_length
                        )

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # all_cross_attentions is None when the config has no
                # cross-attention; adding to it would raise TypeError.
                if all_cross_attentions is not None:
                    # A layer only emits cross-attention weights when it owns a
                    # cross-attention block AND query_length > 0. Otherwise
                    # index 2 is the key/value cache, which must not be
                    # reported as an attention map. None keeps the tuple
                    # aligned with the layer index.
                    ran_cross_attention = (
                        layer_module.has_cross_attention and query_length > 0
                    )
                    all_cross_attentions = all_cross_attentions + (
                        layer_outputs[2] if ran_cross_attention else None,
                    )

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPredictionHeadTransform(nn.Module):
    """Dense -> activation -> LayerNorm transform applied before the LM decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # hidden_act is either the name of an activation or the callable itself.
        activation = config.hidden_act
        self.transform_act_fn = (
            ACT2FN[activation] if isinstance(activation, str) else activation
        )
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Return ``LayerNorm(act(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class BertLMPredictionHead(nn.Module):
    """Transform followed by a linear decoder onto ``config.vocab_size`` logits."""

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The decoder is created without its own bias; a separate per-token
        # bias parameter is attached below.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Share one parameter object so the bias is correctly resized with
        # `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return vocabulary logits for ``hidden_states``."""
        return self.decoder(self.transform(hidden_states))


class BertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing the LM prediction head as ``self.predictions``."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return prediction scores for ``sequence_output``."""
        return self.predictions(sequence_output)
class BertModel(BertPreTrainedModel):
    """
    BERT backbone consisting of ``BertEmbeddings``, a ``BertEncoder`` and an
    optional ``BertPooler``.

    The model can run as a bidirectional encoder or, when ``forward`` is called
    with ``is_decoder=True``, with a causal self-attention mask. When
    :obj:`encoder_hidden_states` is passed to the forward pass it is handed to
    the encoder stack together with its (inverted) attention mask for use in
    cross-attention.
    """

    def __init__(self, config, add_pooling_layer=False):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)

        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        """Return the word-embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: Tensor,
        input_shape: Tuple[int],
        device: device,
        is_decoder: bool,
        has_query: bool = False,
    ) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore. Either 2-D
                ``[batch_size, seq_length]`` or 3-D ``[batch_size, from_seq_length, to_seq_length]``.
            input_shape (:obj:`Tuple[int]`):
                ``(batch_size, seq_length)`` used to size the causal mask in decoder mode.
            device: (:obj:`torch.device`):
                The device of the input to the model.
            is_decoder (:obj:`bool`):
                When true and the mask is 2-D, a causal mask is combined with the padding mask.
            has_query (:obj:`bool`):
                When true and the mask is longer than the causal mask, the extra leading (prefix) positions
                are prevented from attending to the causal part (UniLM style).

        Returns:
            :obj:`torch.Tensor` The extended attention mask cast to the model dtype (``self.dtype``), holding
            0.0 for positions to attend to and -10000.0 for masked positions.

        Raises:
            ValueError: If ``attention_mask`` is neither 2-D nor 3-D.
        """
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape

                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = (
                    seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
                    <= seq_ids[None, :, None]
                )

                # add a prefix ones mask to the causal mask
                # causal and attention masks must have same type with pytorch version < 1.3
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    if has_query:  # UniLM style attention mask
                        # Extra rows for the prefix: zeros, so prefix positions
                        # cannot attend to the causal (text) positions.
                        causal_mask = torch.cat(
                            [
                                torch.zeros(
                                    (batch_size, prefix_seq_len, seq_length),
                                    device=device,
                                    dtype=causal_mask.dtype,
                                ),
                                causal_mask,
                            ],
                            dim=1,
                        )
                    # Extra columns for the prefix: ones, so every row may
                    # attend to the prefix positions.
                    causal_mask = torch.cat(
                        [
                            torch.ones(
                                (batch_size, causal_mask.shape[1], prefix_seq_len),
                                device=device,
                                dtype=causal_mask.dtype,
                            ),
                            causal_mask,
                        ],
                        dim=-1,
                    )
                extended_attention_mask = (
                    causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
                )
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=self.dtype
        )  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        is_decoder=False,
    ):
        r"""
        query_embeds (`optional`):
            Embeddings handed to the embedding layer alongside ``input_ids``; their second dimension is used as
            the query length. Required when ``input_ids`` is :obj:`None`.
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder. May also be a list of such tensors.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            May also be a list of masks. Defaults to all ones when ``encoder_hidden_states`` is given.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        is_decoder (:obj:`bool`, `optional`):
            If set to :obj:`True`, a causal self-attention mask sized from ``input_ids.shape`` is used, so
            ``input_ids`` must be provided.

        Returns:
            ``BaseModelOutputWithPoolingAndCrossAttentions``, or a tuple starting with
            ``(sequence_output, pooled_output)`` when ``return_dict`` is false.

        Raises:
            ValueError: If ``is_decoder`` is true but ``input_ids`` is :obj:`None`.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # use_cache is deliberately taken as passed; it is not defaulted from
        # the config here.

        if input_ids is None:
            assert (
                query_embeds is not None
            ), "You have to specify query_embeds when input_ids is None"

        # Cached keys include the query positions, so those are subtracted to
        # obtain the number of already-processed text positions.
        past_key_values_length = (
            past_key_values[0][0].shape[2] - self.config.query_length
            if past_key_values is not None
            else 0
        )

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
            past_key_values_length=past_key_values_length,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones(
                ((batch_size, seq_length + past_key_values_length)), device=device
            )

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if is_decoder:
            if input_ids is None:
                # The causal mask is sized from the text tokens only; without
                # them the original code failed with an opaque AttributeError.
                raise ValueError("input_ids must be provided when is_decoder=True")
            extended_attention_mask = self.get_extended_attention_mask(
                attention_mask,
                input_ids.shape,
                device,
                is_decoder,
                has_query=(query_embeds is not None),
            )
        else:
            extended_attention_mask = self.get_extended_attention_mask(
                attention_mask, input_shape, device, is_decoder
            )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
                    0
                ].size()
            else:
                (
                    encoder_batch_size,
                    encoder_sequence_length,
                    _,
                ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [
                    self.invert_attention_mask(mask) for mask in encoder_attention_mask
                ]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(
                    encoder_attention_mask
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(
                    encoder_attention_mask
                )
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            query_length=query_length,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = (
            self.pooler(sequence_output) if self.pooler is not None else None
        )

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None 
class BertForMaskedLM(BertPreTrainedModel):
    """``BertModel`` (without pooler) topped with a masked-language-modeling head."""

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        """Return the decoder layer of the prediction head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder layer of the prediction head."""
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        return_logits=False,
        is_decoder=False,
    ):
        r"""
        query_embeds (`optional`):
            Forwarded to the backbone. When given, the first ``query_embeds.shape[1]`` positions of the backbone
            output are dropped before the prediction head is applied.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        return_logits (:obj:`bool`, `optional`):
            If set to :obj:`True`, only the prediction scores are returned.

        Returns:
            The prediction scores when ``return_logits`` is true; otherwise a ``MaskedLMOutput``, or a tuple
            ``(loss?, prediction_scores, ...)`` when ``return_dict`` is false.
        """

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            query_embeds=query_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
        )

        # Always start from the full backbone output. Previously the variable
        # was only bound inside the `if`, so a call without query_embeds
        # raised UnboundLocalError.
        sequence_output = outputs[0]
        if query_embeds is not None:
            # Drop the leading query positions; only the remaining positions
            # are scored.
            sequence_output = sequence_output[:, query_embeds.shape[1] :, :]
        prediction_scores = self.cls(sequence_output)

        if return_logits:
            return prediction_scores

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return (
                ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
            )

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lavis/models/blip2_models/__pycache__/Qformer.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/Qformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6f0cd1c21eb0ec8ab197984ee394ddbed11d97f Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/Qformer.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/__init__.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2f79f125113629c63a9b99537ff1447bb02e0ac Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b0119ab5591dba03862dfd07f0e54997ffa19f6 Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_image_text_matching.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2_image_text_matching.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dc412868907439dea7a4573fb6498448b05e4d7 Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_image_text_matching.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_opt.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2_opt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74ed7558eafa093c184308311e61d8be5626fffe Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_opt.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_qformer.cpython-310.pyc 
b/lavis/models/blip2_models/__pycache__/blip2_qformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e62bcb66a21756c75999cd1e955f991816cdb5c Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_qformer.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_t5.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2_t5.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce80808876cff5271029c7fd1fac8a8a43dbfa1d Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_t5.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_t5_instruct.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2_t5_instruct.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b853368308822e5ff837f1119ff86ab8e41f2dbc Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_t5_instruct.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_vicuna_instruct.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2_vicuna_instruct.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5122b19e2f764c68910eef7c7065a8525be99da8 Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_vicuna_instruct.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/blip2_vicuna_xinstruct.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/blip2_vicuna_xinstruct.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33924b6278d0de8e94f75e427bf23b2c410108d2 Binary files /dev/null and b/lavis/models/blip2_models/__pycache__/blip2_vicuna_xinstruct.cpython-310.pyc differ diff --git a/lavis/models/blip2_models/__pycache__/modeling_llama.cpython-310.pyc b/lavis/models/blip2_models/__pycache__/modeling_llama.cpython-310.pyc new file mode 100644 index 
class Blip2Base(BaseModel):
    """Common base class for the BLIP-2 models in this package.

    Provides the helpers the concrete models share: tokenizer, Q-Former and
    vision-encoder construction, checkpoint loading, layer-wise optimizer
    parameter groups and optional answer lemmatization.
    """

    @classmethod
    def init_tokenizer(cls, truncation_side="right"):
        """Return the ``bert-base-uncased`` tokenizer with ``[DEC]`` added as BOS token."""
        tokenizer = BertTokenizer.from_pretrained(
            "bert-base-uncased", truncation_side=truncation_side
        )
        tokenizer.add_special_tokens({"bos_token": "[DEC]"})
        return tokenizer

    def maybe_autocast(self, dtype=torch.float16):
        """Return a CUDA autocast context for ``dtype``, or a no-op context on CPU."""
        if self.device != torch.device("cpu"):
            return torch.cuda.amp.autocast(dtype=dtype)
        return contextlib.nullcontext()

    @classmethod
    def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2):
        """Build the Q-Former and its learnable query tokens.

        Args:
            num_query_token: number of query embeddings to create.
            vision_width: feature width of the vision encoder (stored as
                ``encoder_width`` on the BERT config).
            cross_attention_freq: stored on the config as ``cross_attention_freq``.

        Returns:
            ``(Qformer, query_tokens)`` where ``query_tokens`` is an
            ``nn.Parameter`` of shape ``(1, num_query_token, hidden_size)``.
        """
        encoder_config = BertConfig.from_pretrained("bert-base-uncased")
        encoder_config.encoder_width = vision_width
        # insert cross-attention layer every other block (with the default frequency)
        encoder_config.add_cross_attention = True
        encoder_config.cross_attention_freq = cross_attention_freq
        encoder_config.query_length = num_query_token
        Qformer = BertLMHeadModel.from_pretrained(
            "bert-base-uncased", config=encoder_config
        )
        query_tokens = nn.Parameter(
            torch.zeros(1, num_query_token, encoder_config.hidden_size)
        )
        query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
        return Qformer, query_tokens

    def init_vision_encoder(
        self, model_name, img_size, drop_path_rate, use_grad_checkpoint, precision
    ):
        """Create the vision encoder and the LayerNorm applied to its output.

        Returns:
            ``(visual_encoder, ln_vision)``. Also records ``self.vit_name``.

        Raises:
            AssertionError: if ``model_name`` is not a known encoder name.
            NotImplementedError: for ``"eva2_clip_L"``, whose factory is not
                available in this build (previously this surfaced as an
                ``UnboundLocalError``).
        """
        assert model_name in [
            "eva_clip_g",
            "eva2_clip_L",
            "clip_L",
        ], "vit model must be eva_clip_g, eva2_clip_L or clip_L"
        if model_name == "eva_clip_g":
            visual_encoder = create_eva_vit_g(
                img_size, drop_path_rate, use_grad_checkpoint, precision
            )
        elif model_name == "clip_L":
            visual_encoder = create_clip_vit_L(img_size, use_grad_checkpoint, precision)
        else:
            # "eva2_clip_L" passes the name check but has no factory wired in.
            raise NotImplementedError(
                "vit model '%s' is not supported in this build" % model_name
            )
        ln_vision = LayerNorm(visual_encoder.num_features)
        self.vit_name = model_name
        return visual_encoder, ln_vision

    def load_from_pretrained(self, url_or_filename):
        """Load ``checkpoint["model"]`` non-strictly from a URL or a local file.

        Returns:
            The result of ``load_state_dict(..., strict=False)``.

        Raises:
            RuntimeError: if the argument is neither a URL nor an existing file.
        """
        if is_url(url_or_filename):
            checkpoint_path = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
        elif os.path.isfile(url_or_filename):
            checkpoint_path = url_or_filename
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        msg = self.load_state_dict(checkpoint["model"], strict=False)

        logging.info("load checkpoint from %s" % url_or_filename)
        return msg

    def get_optimizer_params(self, weight_decay, lr_scale=1):
        """Group trainable parameters for the optimizer.

        1-D parameters and biases get no weight decay. Vision-encoder
        parameters are additionally grouped per layer and given an
        ``lr_scale`` of ``lr_scale ** (num_layers + 1 - layer_id)``.

        Returns:
            A list of dicts with keys ``weight_decay``, ``params`` and ``lr_scale``.
        """
        vit_num_layers = self.visual_encoder.get_num_layer()
        lr_scales = [
            lr_scale ** (vit_num_layers + 1 - i) for i in range(vit_num_layers + 2)
        ]

        param_groups = {}
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue  # frozen weights
            if len(param.shape) == 1 or name.endswith(".bias"):
                group_name, this_weight_decay = "no_decay", 0.0
            else:
                group_name, this_weight_decay = "decay", weight_decay

            layer_id = None
            if "visual_encoder" in name:
                layer_id = self.visual_encoder.get_num_layer(
                    name.replace("visual_encoder.", "")
                )
                group_name = "vit_layer_%d_%s" % (layer_id, group_name)

            if group_name not in param_groups:
                param_groups[group_name] = {
                    "weight_decay": this_weight_decay,
                    "params": [],
                    "lr_scale": lr_scales[layer_id] if layer_id is not None else 1,
                }
            param_groups[group_name]["params"].append(param)

        return list(param_groups.values())

    def _lemmatize(self, answers):
        """Replace nouns and verbs in each answer with their lemma."""

        def apply(answer):
            doc = self.lemmatizer(answer)
            return " ".join(
                token.lemma_ if token.pos_ in ["NOUN", "VERB"] else token.text
                for token in doc
            )

        return [apply(answer) for answer in answers]

    @property
    def lemmatizer(self):
        """Lazily loaded spaCy ``en_core_web_sm`` pipeline (cached on the instance).

        Logs install instructions and exits the process when spaCy or the
        model is unavailable.
        """
        # Subclasses are not required to pre-set ``_lemmatizer``.
        lemmatizer = getattr(self, "_lemmatizer", None)
        if lemmatizer is None:
            try:
                import spacy

                lemmatizer = spacy.load("en_core_web_sm")
            except (ImportError, OSError):
                # spacy.load raises OSError when the model package is missing.
                logging.error(
                    """
                    Please install spacy and en_core_web_sm model to apply lemmatization.
                    python -m spacy download en_core_web_sm
                    OR
                    import spacy.cli
                    spacy.cli.download("en_core_web_sm")
                    """
                )
                exit(1)
            self._lemmatizer = lemmatizer

        return lemmatizer
def disabled_train(self, mode=True):
    """Stand-in for ``train`` that leaves the module untouched.

    Meant to be assigned over a module's ``train`` attribute so that later
    train/eval switches do not change it. ``mode`` is accepted and ignored.
    """
    return self


class LayerNorm(nn.LayerNorm):
    """``nn.LayerNorm`` that computes in fp32 and returns the input's dtype."""

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        # Normalize in full precision, then cast back (keeps fp16 inputs stable).
        normalized = super().forward(x.to(torch.float32))
        return normalized.to(input_dtype)
def compute_sim_matrix(model, data_loader, **kwargs):
    """Compute image-to-text and text-to-image retrieval score matrices.

    Text and image embeddings are computed for the whole dataset, a coarse
    similarity matrix selects the top ``k_test`` candidates per query, and
    only those candidates are rescored with ``model.compute_itm``. Entries
    that are never rescored keep the fill value ``-100.0``. Each rank
    processes its own slice of rows; the matrices are summed across ranks
    when distributed mode is active.

    Args:
        model: must provide ``tokenizer``, ``forward_text``, ``text_proj``,
            ``forward_image``, ``vision_proj``, ``compute_itm`` and ``device``.
        data_loader: loader whose ``dataset`` exposes ``text`` and ``image``.
        **kwargs: must contain ``k_test``.

    Returns:
        ``(score_matrix_i2t, score_matrix_t2i)`` as numpy arrays.
    """
    k_test = kwargs.pop("k_test")

    metric_logger = MetricLogger(delimiter=" ")
    header = "Evaluation:"

    logging.info("Computing features for evaluation...")
    start_time = time.time()

    # ---- text side: tokenize and embed in fixed-size chunks ----
    texts = data_loader.dataset.text
    num_text = len(texts)
    text_bs = 256
    id_chunks, embed_chunks, mask_chunks = [], [], []
    for begin in range(0, num_text, text_bs):
        text_input = model.tokenizer(
            texts[begin : min(num_text, begin + text_bs)],
            padding="max_length",
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(model.device)
        text_feat = model.forward_text(text_input)
        embed_chunks.append(F.normalize(model.text_proj(text_feat)))
        id_chunks.append(text_input.input_ids)
        mask_chunks.append(text_input.attention_mask)

    text_embeds = torch.cat(embed_chunks, dim=0)
    text_ids = torch.cat(id_chunks, dim=0)
    text_atts = torch.cat(mask_chunks, dim=0)

    # ---- image side: keep raw ViT features on CPU for the ITM rescoring ----
    vit_chunks, image_chunks = [], []
    for samples in data_loader:
        image = samples["image"].to(model.device)
        image_feat, vit_feat = model.forward_image(image)
        image_chunks.append(F.normalize(model.vision_proj(image_feat), dim=-1))
        vit_chunks.append(vit_feat.cpu())

    vit_feats = torch.cat(vit_chunks, dim=0)
    image_embeds = torch.cat(image_chunks, dim=0)

    # Coarse image-to-text similarity: best query token per caption.
    sims_matrix = torch.stack(
        [(image_embed @ text_embeds.t()).max(0)[0] for image_embed in image_embeds],
        dim=0,
    )

    num_tasks = dist_utils.get_world_size()
    rank = dist_utils.get_rank()

    def rank_slice(total):
        # Contiguous slice of rows handled by this rank.
        step = total // num_tasks + 1
        lo = rank * step
        return lo, min(total, lo + step)

    # ---- image -> text rescoring ----
    score_matrix_i2t = torch.full(
        (len(data_loader.dataset.image), len(texts)), -100.0
    ).to(model.device)
    start, end = rank_slice(sims_matrix.size(0))
    for offset, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):
        row = start + offset
        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
        image_inputs = vit_feats[row].repeat(k_test, 1, 1).to(model.device)
        score = model.compute_itm(
            image_inputs=image_inputs,
            text_ids=text_ids[topk_idx],
            text_atts=text_atts[topk_idx],
        ).float()
        score_matrix_i2t[row, topk_idx] = score + topk_sim

    # ---- text -> image rescoring ----
    sims_matrix = sims_matrix.t()
    score_matrix_t2i = torch.full(
        (len(texts), len(data_loader.dataset.image)), -100.0
    ).to(model.device)
    start, end = rank_slice(sims_matrix.size(0))
    for offset, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):
        row = start + offset
        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
        image_inputs = vit_feats[topk_idx.cpu()].to(model.device)
        score = model.compute_itm(
            image_inputs=image_inputs,
            text_ids=text_ids[row].repeat(k_test, 1),
            text_atts=text_atts[row].repeat(k_test, 1),
        ).float()
        score_matrix_t2i[row, topk_idx] = score + topk_sim

    if dist_utils.is_dist_avail_and_initialized():
        dist.barrier()
        for matrix in (score_matrix_i2t, score_matrix_t2i):
            torch.distributed.all_reduce(matrix, op=torch.distributed.ReduceOp.SUM)

    elapsed = str(datetime.timedelta(seconds=int(time.time() - start_time)))
    logging.info("Evaluation time {}".format(elapsed))

    return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()
@registry.register_model("blip2_image_text_matching")
class Blip2ITM(Blip2Qformer):
    """
    BLIP-2 Image-Text Matching (ITM) model.
    Supported model types:
        - pretrained: pretrained model
        - coco: finetuned model on coco
    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("blip2_image_text_matching", "pretrained")
        >>> model = load_model("blip2_image_text_matching", "coco")
    """

    def __init__(
        self,
        vit_model="eva_clip_g",
        img_size=224,
        drop_path_rate=0,
        use_grad_checkpoint=False,
        vit_precision="fp16",
        freeze_vit=True,
        num_query_token=32,
        cross_attention_freq=2,
        embed_dim=256,
        max_txt_len=32,
    ):
        """Forward every argument unchanged to ``Blip2Qformer.__init__``."""
        super().__init__(
            vit_model=vit_model,
            img_size=img_size,
            drop_path_rate=drop_path_rate,
            use_grad_checkpoint=use_grad_checkpoint,
            vit_precision=vit_precision,
            freeze_vit=freeze_vit,
            num_query_token=num_query_token,
            cross_attention_freq=cross_attention_freq,
            embed_dim=embed_dim,
            max_txt_len=max_txt_len,
        )

    def forward(self, samples, match_head="itm"):
        """Score image/caption pairs.

        Args:
            samples: dict with ``"image"`` (image tensor) and ``"text_input"``
                (a caption or a list of captions).
            match_head: ``"itm"`` returns the ITM head output averaged over
                the query tokens; ``"itc"`` returns the maximum over query
                tokens of the normalized image/text feature similarity.

        Raises:
            ValueError: if ``match_head`` is neither ``"itm"`` nor ``"itc"``
                (previously this silently returned ``None``).
        """
        if match_head not in ("itm", "itc"):
            raise ValueError(
                "match_head must be 'itm' or 'itc', got %r" % (match_head,)
            )

        image = samples["image"]
        caption = samples["text_input"]

        with self.maybe_autocast():
            image_embeds = self.ln_vision(self.visual_encoder(image))
        image_embeds = image_embeds.float()
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )

        # padding=True lets captions of different lengths be batched into one
        # tensor; without it tensor conversion fails for ragged batches.
        text = self.tokenizer(
            caption,
            padding=True,
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)

        if match_head == "itm":
            query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
                image.device
            )
            attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)
            output_itm = self.Qformer.bert(
                text.input_ids,
                query_embeds=query_tokens,
                attention_mask=attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            # Only the query positions feed the ITM head.
            itm_embeddings = output_itm.last_hidden_state[:, : query_tokens.size(1), :]
            itm_logit = self.itm_head(itm_embeddings)
            return itm_logit.mean(dim=1)

        # match_head == "itc"
        query_output = self.Qformer.bert(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        image_feats = F.normalize(
            self.vision_proj(query_output.last_hidden_state), dim=-1
        )

        text_output = self.Qformer.bert(
            text.input_ids,
            attention_mask=text.attention_mask,
            return_dict=True,
        )
        text_feat = F.normalize(
            self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1
        )

        sims = torch.bmm(image_feats, text_feat.unsqueeze(-1))
        sim, _ = torch.max(sims, dim=1)
        return sim
@registry.register_model("blip2_opt")
class Blip2OPT(Blip2Base):
    """
    BLIP2 OPT model.
    Supported model types:
        - pretrain_opt2.7b: pretrained model with OPT2.7b
        - pretrain_opt6.7b: pretrained model with OPT6.7b
        - caption_coco_opt2.7b: finetuned image captioning model with OPT2.7b
        - caption_coco_opt6.7b: finetuned image captioning model with OPT6.7b
    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("blip2_opt", "caption_coco_opt2.7b")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "pretrain_opt2.7b": "configs/models/blip2/blip2_pretrain_opt2.7b.yaml",
        "pretrain_opt6.7b": "configs/models/blip2/blip2_pretrain_opt6.7b.yaml",
        "caption_coco_opt2.7b": "configs/models/blip2/blip2_caption_opt2.7b.yaml",
        "caption_coco_opt6.7b": "configs/models/blip2/blip2_caption_opt6.7b.yaml",
    }

    def __init__(
        self,
        vit_model="eva_clip_g",
        img_size=224,
        drop_path_rate=0,
        use_grad_checkpoint=False,
        vit_precision="fp16",
        freeze_vit=True,
        num_query_token=32,
        opt_model="facebook/opt-2.7b",
        prompt="",
        max_txt_len=32,
        apply_lemmatizer=False,
    ):
        """
        apply_lemmatizer: when set to True, postprocess predict_answers() result with lemmas.
        """
        super().__init__()
        transformers_version = version.parse(transformers.__version__)
        assert transformers_version >= version.parse("4.27"), "BLIP-2 OPT requires transformers>=4.27"

        self.tokenizer = self.init_tokenizer()

        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
        )
        if freeze_vit:
            for param in self.visual_encoder.parameters():
                param.requires_grad = False
            self.visual_encoder = self.visual_encoder.eval()
            # Keep the frozen encoder in eval mode even if model.train() is called.
            self.visual_encoder.train = disabled_train
            logging.info("freeze vision encoder")

        self.Qformer, self.query_tokens = self.init_Qformer(
            num_query_token, self.visual_encoder.num_features
        )
        # Only the query branch of the Q-Former is used here; drop the text parts.
        self.Qformer.cls = None
        self.Qformer.bert.embeddings.word_embeddings = None
        self.Qformer.bert.embeddings.position_embeddings = None
        for layer in self.Qformer.bert.encoder.layer:
            layer.output = None
            layer.intermediate = None

        self.opt_tokenizer = AutoTokenizer.from_pretrained(opt_model, use_fast=False)
        self.opt_model = OPTForCausalLM.from_pretrained(
            opt_model, torch_dtype=torch.float16
        )
        for param in self.opt_model.parameters():
            param.requires_grad = False
        # Generation stops at the newline token.
        self.eos_token_id = self.opt_tokenizer(
            "\n", add_special_tokens=False
        ).input_ids[0]

        self.opt_proj = nn.Linear(
            self.Qformer.config.hidden_size, self.opt_model.config.hidden_size
        )

        self.max_txt_len = max_txt_len
        self.prompt = prompt
        prompt_tokens = self.opt_tokenizer(self.prompt, return_tensors="pt")
        self.prompt_length = prompt_tokens.attention_mask.sum(1)

        self._apply_lemmatizer = apply_lemmatizer
        self._lemmatizer = None

    def _embed_image(self, image):
        """Encode ``image`` into OPT-space query embeddings.

        Returns:
            ``(inputs_opt, atts_opt)``: the projected Q-Former outputs and an
            all-ones attention mask over them.
        """
        with self.maybe_autocast():
            image_embeds = self.ln_vision(self.visual_encoder(image))
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_output = self.Qformer.bert(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        inputs_opt = self.opt_proj(query_output.last_hidden_state)
        atts_opt = torch.ones(inputs_opt.size()[:-1], dtype=torch.long).to(
            image.device
        )
        return inputs_opt, atts_opt

    def forward(self, samples):
        """Compute the language-modeling loss for ``samples``.

        Args:
            samples: dict with ``"image"`` and ``"text_input"`` (list of str).

        Returns:
            ``{"loss": loss}``.
        """
        image = samples["image"]
        inputs_opt, atts_opt = self._embed_image(image)

        self.opt_tokenizer.padding_side = "right"

        text = [t + "\n" for t in samples["text_input"]]

        opt_tokens = self.opt_tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
        ).to(image.device)

        targets = opt_tokens.input_ids.masked_fill(
            opt_tokens.input_ids == self.opt_tokenizer.pad_token_id, -100
        )
        if self.prompt:
            targets[:, : self.prompt_length] = -100  # do not apply loss to the prompt

        # No loss on the image query positions either.
        empty_targets = (
            torch.ones(atts_opt.size(), dtype=torch.long).to(image.device).fill_(-100)
        )
        targets = torch.cat([empty_targets, targets], dim=1)

        inputs_embeds = self.opt_model.model.decoder.embed_tokens(opt_tokens.input_ids)
        inputs_embeds = torch.cat([inputs_opt, inputs_embeds], dim=1)
        attention_mask = torch.cat([atts_opt, opt_tokens.attention_mask], dim=1)

        with self.maybe_autocast():
            outputs = self.opt_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                return_dict=True,
                labels=targets,
            )

        return {"loss": outputs.loss}

    @torch.no_grad()
    def generate(
        self,
        samples,
        use_nucleus_sampling=False,
        num_beams=5,
        max_length=30,
        min_length=1,
        top_p=0.9,
        repetition_penalty=1.0,
        length_penalty=1.0,
        num_captions=1,
        temperature=1,
    ):
        """
        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - prompt (str or list of str, optional): overrides ``self.prompt``.
                  A string is used for every image; a list must hold one
                  prompt per image.
            use_nucleus_sampling (bool): Whether to sample (nucleus sampling). If False, use beam search.
            num_beams (int): Number of beams for beam search. 1 means no beam search.
            max_length (int): The maximum length of the sequence to be generated.
            min_length (int): The minimum length of the sequence to be generated.
            top_p (float): The cumulative probability for nucleus sampling.
            repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
            length_penalty (float): Length penalty forwarded to the OPT ``generate`` call.
            num_captions (int): Number of captions to be generated for each image.
            temperature (float): Sampling temperature forwarded to the OPT ``generate`` call.
        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        Raises:
            ValueError: if a list prompt does not have one entry per image.
        """
        image = samples["image"]
        with self.maybe_autocast():
            inputs_opt, atts_opt = self._embed_image(image)

            prompt = samples["prompt"] if "prompt" in samples else self.prompt
            if isinstance(prompt, str):
                prompt = [prompt] * image.size(0)
            elif len(prompt) != image.size(0):
                raise ValueError(
                    "The number of prompts must be equal to the batch size."
                )
            else:
                prompt = list(prompt)

            # Left padding keeps per-image prompts adjacent to the generated
            # tokens; it is a no-op when one prompt is repeated.
            self.opt_tokenizer.padding_side = "left"
            opt_tokens = self.opt_tokenizer(
                prompt,
                return_tensors="pt",
                padding="longest",
                truncation=True,
                max_length=self.max_txt_len,
            ).to(image.device)
            attention_mask = torch.cat([atts_opt, opt_tokens.attention_mask], dim=1)

            # requires transformers>=4.27 (generation from inputs_embeds)
            inputs_embeds = self.opt_model.get_input_embeddings()(opt_tokens.input_ids)
            inputs_embeds = torch.cat([inputs_opt, inputs_embeds], dim=1)

            outputs = self.opt_model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                do_sample=use_nucleus_sampling,
                top_p=top_p,
                temperature=temperature,
                num_beams=num_beams,
                max_length=max_length,
                min_length=min_length,
                eos_token_id=self.eos_token_id,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                num_return_sequences=num_captions,
            )
            output_text = self.opt_tokenizer.batch_decode(
                outputs, skip_special_tokens=True
            )

            return [text.strip() for text in output_text]

    def predict_answers(
        self,
        samples,
        num_beams=5,
        inference_method="generate",
        max_len=10,
        min_len=1,
        num_ans_candidates=128,
        answer_list=None,
        prompt="",
        length_penalty=0,
        **kwargs
    ):
        """Answer the questions in ``samples["text_input"]`` with beam search.

        Args:
            samples: dict with ``"image"`` and ``"text_input"`` (str or list
                of str); a truthy ``"apply_lemmatizer"`` entry enables
                lemmatization for this call. A string ``"text_input"`` is
                replaced in place by a one-element list.
            num_beams, max_len, min_len, length_penalty: forwarded to the OPT
                ``generate`` call.
            prompt: optional format string applied to each question.
            inference_method, num_ans_candidates, answer_list, **kwargs:
                accepted but not used by this implementation.

        Returns:
            List of stripped answer strings, lemmatized when enabled.
        """
        image = samples["image"]
        with self.maybe_autocast():
            inputs_opt, atts_opt = self._embed_image(image)

            if isinstance(samples["text_input"], str):
                samples["text_input"] = [samples["text_input"]]
            if prompt:
                text_input = [
                    prompt.format(question) for question in samples["text_input"]
                ]
            else:
                text_input = samples["text_input"]

            self.opt_tokenizer.padding_side = "left"
            opt_tokens = self.opt_tokenizer(
                text_input,
                return_tensors="pt",
                padding="longest",
                truncation=True,
                max_length=self.max_txt_len,
            ).to(image.device)

            attention_mask = torch.cat([atts_opt, opt_tokens.attention_mask], dim=1)

            # require transformers>=4.27
            inputs_embeds = self.opt_model.get_input_embeddings()(opt_tokens.input_ids)
            inputs_embeds = torch.cat([inputs_opt, inputs_embeds], dim=1)

            outputs = self.opt_model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                do_sample=False,
                num_beams=num_beams,
                max_new_tokens=max_len,
                min_length=min_len,
                eos_token_id=self.eos_token_id,
                length_penalty=length_penalty,
            )
            output_text = self.opt_tokenizer.batch_decode(
                outputs, skip_special_tokens=True
            )
            output_text = [text.strip() for text in output_text]

        # _lemmatize / lemmatizer are inherited from Blip2Base.
        if self._apply_lemmatizer or samples.get("apply_lemmatizer"):
            output_text = self._lemmatize(output_text)

        return output_text

    @classmethod
    def from_config(cls, cfg):
        """Build the model from ``cfg`` and load the checkpoint it names."""
        model = cls(
            vit_model=cfg.get("vit_model", "eva_clip_g"),
            img_size=cfg.get("image_size"),
            drop_path_rate=cfg.get("drop_path_rate", 0),
            use_grad_checkpoint=cfg.get("use_grad_checkpoint", False),
            vit_precision=cfg.get("vit_precision", "fp16"),
            freeze_vit=cfg.get("freeze_vit", True),
            num_query_token=cfg.get("num_query_token"),
            opt_model=cfg.get("opt_model"),
            prompt=cfg.get("prompt", ""),
            max_txt_len=cfg.get("max_txt_len", 32),
            apply_lemmatizer=cfg.get("apply_lemmatizer", False),
        )
        model.load_checkpoint_from_config(cfg)

        return model
def __init__(
    self,
    vit_model="eva_clip_g",
    img_size=224,
    drop_path_rate=0,
    use_grad_checkpoint=False,
    vit_precision="fp16",
    freeze_vit=True,
    num_query_token=32,
    cross_attention_freq=2,
    embed_dim=256,
    max_txt_len=32,
):
    """Build the vision encoder, the Q-Former and the ITC/ITM heads.

    With ``freeze_vit`` the vision encoder's parameters stop requiring
    gradients and the encoder is pinned to eval mode.
    """
    super().__init__()

    self.tokenizer = self.init_tokenizer()

    self.visual_encoder, self.ln_vision = self.init_vision_encoder(
        vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
    )
    if freeze_vit:
        for weight in self.visual_encoder.parameters():
            weight.requires_grad = False
        self.visual_encoder = self.visual_encoder.eval()
        self.visual_encoder.train = disabled_train
        logging.info("freeze vision encoder")

    self.Qformer, self.query_tokens = self.init_Qformer(
        num_query_token, self.visual_encoder.num_features, cross_attention_freq
    )
    self.Qformer.resize_token_embeddings(len(self.tokenizer))

    # Initialise every "*_query" parameter from its same-named counterpart
    # without the "_query" suffix.
    qformer_state = self.Qformer.state_dict()
    for param_name, weight in self.Qformer.named_parameters():
        if "_query" not in param_name:
            continue
        weight.data.copy_(qformer_state[param_name.replace("_query", "")])

    hidden_size = self.Qformer.config.hidden_size
    self.vision_proj = nn.Linear(hidden_size, embed_dim)
    self.text_proj = nn.Linear(hidden_size, embed_dim)

    self.itm_head = nn.Linear(hidden_size, 2)

    self.temp = nn.Parameter(0.07 * torch.ones([]))

    self.max_txt_len = max_txt_len
self.Qformer, self.query_tokens = self.init_Qformer( + num_query_token, self.visual_encoder.num_features, cross_attention_freq + ) + self.Qformer.resize_token_embeddings(len(self.tokenizer)) + state_dict = self.Qformer.state_dict() + for name, param in self.Qformer.named_parameters(): + if "_query" in name: + key_orig = name.replace("_query", "") + param.data.copy_(state_dict[key_orig]) + + self.vision_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim) + self.text_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim) + + self.itm_head = nn.Linear(self.Qformer.config.hidden_size, 2) + + self.temp = nn.Parameter(0.07 * torch.ones([])) + + self.max_txt_len = max_txt_len + + def forward(self, samples): + image = samples["image"] + text = samples["text_input"] + + image_embeds = self.ln_vision(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( + image.device + ) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + use_cache=True, + return_dict=True, + ) + + image_feats = F.normalize( + self.vision_proj(query_output.last_hidden_state), dim=-1 + ) + + text_tokens = self.tokenizer( + text, + padding="max_length", + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(image.device) + text_output = self.Qformer.bert( + text_tokens.input_ids, + attention_mask=text_tokens.attention_mask, + return_dict=True, + ) + text_feat = F.normalize( + self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1 + ) + + ###============== Image-text Contrastive ===================### + image_feats_all = concat_all_gather( + image_feats + ) # [batch_size*num_gpu, num_query_tokens, embed_dim] + text_feat_all = concat_all_gather(text_feat) # [batch_size*num_gpu, embed_dim] + + sim_q2t = torch.matmul( + image_feats.unsqueeze(1), 
text_feat_all.unsqueeze(-1) + ).squeeze() + # [batch_size, batch_size*num_gpu, num_query_tokens] + + # image-text similarity: aggregate across all query tokens + sim_i2t, _ = sim_q2t.max(-1) + sim_i2t = sim_i2t / self.temp + + # text-query similarity: [batch_size, batch_size*num_gpu, num_query_tokens] + sim_t2q = torch.matmul( + text_feat.unsqueeze(1).unsqueeze(1), image_feats_all.permute(0, 2, 1) + ).squeeze() + + # text-image similarity: aggregate across all query tokens + sim_t2i, _ = sim_t2q.max(-1) + sim_t2i = sim_t2i / self.temp # [batch_size, batch_size*num_gpu] + + rank = dist.get_rank() + bs = image.size(0) + targets = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to( + image.device + ) + + if "image_id" in samples.keys(): #coco retrieval finetuning + image_ids = samples["image_id"].view(-1,1) + image_ids_all = concat_all_gather(image_ids) + pos_idx = torch.eq(image_ids, image_ids_all.t()).float() + sim_targets = pos_idx / pos_idx.sum(1,keepdim=True) + sim_targets = 0.9 * sim_targets + 0.1 * torch.ones_like(sim_targets) / sim_targets.size(1) + + loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1)*sim_targets,dim=1).mean() + loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1)*sim_targets,dim=1).mean() + loss_itc = (loss_t2i+loss_i2t)/2 + else: + loss_itc = ( + F.cross_entropy(sim_i2t, targets, label_smoothing=0.1) + + F.cross_entropy(sim_t2i, targets, label_smoothing=0.1) + ) / 2 + + ###============== Image-text Matching ===================### + text_input_ids_world = concat_all_gather(text_tokens.input_ids) + text_attention_mask_world = concat_all_gather(text_tokens.attention_mask) + image_embeds_world = all_gather_with_grad(image_embeds) + with torch.no_grad(): + if "image_id" in samples.keys(): + mask = torch.eq(image_ids, image_ids_all.t()) + sim_t2i.masked_fill_(mask, -10000) + sim_i2t.masked_fill_(mask, -10000) + else: + sim_t2i[:, rank * bs : rank * bs + bs].fill_diagonal_(-10000) + sim_i2t[:, rank * bs : rank * bs + 
bs].fill_diagonal_(-10000) + + weights_t2i = F.softmax(sim_t2i, dim=1) + weights_i2t = F.softmax(sim_i2t, dim=1) + + # select a negative image for each text + image_embeds_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_t2i[b], 1).item() + image_embeds_neg.append(image_embeds_world[neg_idx]) + image_embeds_neg = torch.stack(image_embeds_neg, dim=0) + + # select a negative text for each image + text_ids_neg = [] + text_atts_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_i2t[b], 1).item() + text_ids_neg.append(text_input_ids_world[neg_idx]) + text_atts_neg.append(text_attention_mask_world[neg_idx]) + + text_ids_neg = torch.stack(text_ids_neg, dim=0) + text_atts_neg = torch.stack(text_atts_neg, dim=0) + + text_ids_all = torch.cat( + [text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], dim=0 + ) # pos, pos, neg + text_atts_all = torch.cat( + [text_tokens.attention_mask, text_tokens.attention_mask, text_atts_neg], + dim=0, + ) + + query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1) + query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long).to( + image.device + ) + attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1) + + image_embeds_all = torch.cat( + [image_embeds, image_embeds_neg, image_embeds], dim=0 + ) # pos, neg, pos + image_atts_all = torch.ones(image_embeds_all.size()[:-1], dtype=torch.long).to( + image.device + ) + + output_itm = self.Qformer.bert( + text_ids_all, + query_embeds=query_tokens_itm, + attention_mask=attention_mask_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + return_dict=True, + ) + + vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] + vl_output = self.itm_head(vl_embeddings) + logits = vl_output.mean(dim=1) + + itm_labels = torch.cat( + [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)], + dim=0, + ).to(image.device) + loss_itm = 
@torch.no_grad()
def generate(
    self,
    samples,
    use_nucleus_sampling=False,
    num_beams=3,
    max_length=30,
    min_length=10,
    top_p=0.9,
    repetition_penalty=1.0,
):
    """
    Generate captions for a batch of images with the Q-Former LM head.

    Args:
        samples (dict): A dictionary containing the following keys:
            - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
        use_nucleus_sampling (bool): Whether to use nucleus sampling. If False, use beam search.
        num_beams (int): Number of beams for beam search. Forced to 1 when
            nucleus sampling is enabled.
        max_length (int): The maximum length of the sequence to be generated.
        min_length (int): The minimum length of the sequence to be generated.
        top_p (float): The cumulative probability for nucleus sampling.
        repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
    Returns:
        captions (list): The decoded strings, one per sequence returned by
            ``self.Qformer.generate``.
    """
    image = samples["image"]
    image_embeds = self.ln_vision(self.visual_encoder(image))

    if use_nucleus_sampling:
        # Sampling decodes a single hypothesis per image.
        num_beams = 1
    else:
        # The encoder states are handed over through model_kwargs, so they
        # are repeated here to provide one copy per beam.
        image_embeds = image_embeds.repeat_interleave(num_beams, dim=0)

    image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
        image.device
    )

    model_kwargs = {
        "encoder_hidden_states": image_embeds,
        "encoder_attention_mask": image_atts,
    }

    # Every sequence starts from a single BOS token.
    input_ids = torch.full(
        (image.size(0), 1),
        self.tokenizer.bos_token_id,
        dtype=torch.long,
        device=image.device,
    )
    query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)

    outputs = self.Qformer.generate(
        input_ids=input_ids,
        query_embeds=query_tokens,
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        do_sample=use_nucleus_sampling,
        top_p=top_p,
        # Previously accepted by this method but silently dropped.
        repetition_penalty=repetition_penalty,
        eos_token_id=self.tokenizer.sep_token_id,
        pad_token_id=self.tokenizer.pad_token_id,
        **model_kwargs
    )
    captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return captions
@torch.no_grad()
def extract_features(self, samples, mode="multimodal"):
    """
    Extract features for multimodal or unimodal samples.
    Args:
        samples (dict): A dictionary of samples, containing the following keys:
            - image (torch.Tensor): A tensor of shape (B, C, H, W) containing the image.
                Raw images should be preprocessed before being passed to feature extractor.
            - text_input (list): A list of strings containing the text, length B.
        mode (str): The mode of feature extraction. Can be either "multimodal", "text" or "image".
            If "multimodal", return only the multimodal (query) features;
            if "text", return text features and their projection;
            if "image", return image (query) features and their projection.
            Default: "multimodal".
    Returns:
        BlipOutputFeatures: A BlipOutputFeatures object containing the features.
            Fields that the selected mode does not compute are left as None.
            See lavis/models/blip_models/blip_outputs.py for more details.
    Raises:
        AssertionError: if ``mode`` is unknown, or an input required by the
            selected mode is missing from ``samples``.
    """
    image = samples.get("image")
    caption = samples.get("text_input")

    # assert mode is one of "image", "text", "multimodal"
    assert mode in [
        "image",
        "text",
        "multimodal",
    ], "mode must be one of 'image', 'text', 'multimodal'"

    # Validate inputs up front. "multimodal" needs both inputs; previously it
    # was not checked at all and failed later inside the encoder/tokenizer.
    if mode in ("image", "multimodal"):
        assert (
            image is not None
        ), "Image is not provided for mode 'image' or 'multimodal'"
    if mode in ("text", "multimodal"):
        assert (
            caption is not None
        ), "text input is None for mode 'text' or 'multimodal'"

    # initialize output
    image_embeds, text_embeds, multimodal_embeds = None, None, None
    image_features, text_features = None, None

    if mode == "image":
        # return query features
        with self.maybe_autocast():
            image_embeds_frozen = self.ln_vision(self.visual_encoder(image))
        image_embeds_frozen = image_embeds_frozen.float()
        image_atts = torch.ones(
            image_embeds_frozen.size()[:-1], dtype=torch.long
        ).to(self.device)
        query_tokens = self.query_tokens.expand(
            image_embeds_frozen.shape[0], -1, -1
        )

        query_output = self.Qformer.bert(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds_frozen,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        image_embeds = query_output.last_hidden_state
        image_features = F.normalize(self.vision_proj(image_embeds), dim=-1)

    elif mode == "text":
        # return text features
        text = self.tokenizer(caption, return_tensors="pt", padding=True).to(
            self.device
        )

        text_output = self.Qformer.bert(
            text.input_ids,
            attention_mask=text.attention_mask,
            return_dict=True,
        )
        text_embeds = text_output.last_hidden_state
        text_features = self.text_proj(text_embeds)
        text_features = F.normalize(text_features, dim=-1)

    elif mode == "multimodal":
        # return multimodal query features
        with self.maybe_autocast():
            image_embeds_frozen = self.ln_vision(self.visual_encoder(image))
        image_embeds_frozen = image_embeds_frozen.float()
        image_atts = torch.ones(
            image_embeds_frozen.size()[:-1], dtype=torch.long
        ).to(self.device)
        query_tokens = self.query_tokens.expand(
            image_embeds_frozen.shape[0], -1, -1
        )
        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
            self.device
        )

        text = self.tokenizer(caption, return_tensors="pt", padding=True).to(
            self.device
        )
        # Queries and text tokens are attended to jointly.
        attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)

        output = self.Qformer.bert(
            text.input_ids,
            query_embeds=query_tokens,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds_frozen,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        # Keep only the positions that belong to the query tokens.
        multimodal_embeds = output.last_hidden_state[:, : query_tokens.size(1), :]

    return BlipOutputFeatures(
        image_embeds=image_embeds,
        image_embeds_proj=image_features,
        text_embeds=text_embeds,
        text_embeds_proj=text_features,
        multimodal_embeds=multimodal_embeds,
    )
+ """ + k_test = task_cfg.k_test + + return compute_sim_matrix(model=self, data_loader=data_loader, k_test=k_test) diff --git a/lavis/models/blip2_models/blip2_t5.py b/lavis/models/blip2_models/blip2_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..ba98e431854674ef92d6616a3b0daad432e4801e --- /dev/null +++ b/lavis/models/blip2_models/blip2_t5.py @@ -0,0 +1,383 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +import logging + +import torch +import torch.nn as nn +from torch.cuda.amp import autocast as autocast +from transformers import T5TokenizerFast + +from lavis.common.registry import registry +from lavis.models.blip2_models.blip2 import Blip2Base, disabled_train +from lavis.models.blip2_models.modeling_t5 import T5Config, T5ForConditionalGeneration + + +@registry.register_model("blip2_t5") +class Blip2T5(Blip2Base): + """ + BLIP2 T5 model. 
+ Supported model types: + - pretrain_flant5xl: pretrained model with FlanT5-XL + - pretrain_flant5xl_vitL: pretrained model with FlanT5-XL + - pretrain_flant5xxl: pretrained model with FlanT5-XXL + - caption_coco_flant5xl: fintuned image captioning model with FlanT5-XL + Usage: + >>> from lavis.models import load_model + >>> model = load_model("blip2_t5", "pretrain_flant5xl") + """ + + PRETRAINED_MODEL_CONFIG_DICT = { + "pretrain_flant5xl": "configs/models/blip2/blip2_pretrain_flant5xl.yaml", + "pretrain_flant5xl_vitL": "configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml", + "pretrain_flant5xxl": "configs/models/blip2/blip2_pretrain_flant5xxl.yaml", + "caption_coco_flant5xl": "configs/models/blip2/blip2_caption_flant5xl.yaml", + } + + def __init__( + self, + vit_model="eva_clip_g", + img_size=224, + drop_path_rate=0, + use_grad_checkpoint=False, + vit_precision="fp16", + freeze_vit=True, + num_query_token=32, + t5_model="google/flan-t5-xl", + prompt="", + max_txt_len=32, + apply_lemmatizer=False, + ): + """ + apply_lemmatizer: when set to True, postprocess predict_answers() result with lemmas. 
def forward(self, samples):
    """
    Compute the T5 language-modeling loss for one batch.

    Args:
        samples (dict): must provide "image", "text_input" and "text_output".
            The image goes through the vision encoder and the Q-Former; both
            text fields are tokenized with the T5 tokenizer.
    Returns:
        dict: ``{"loss": loss}`` where ``loss`` is the ``.loss`` attribute of
            the T5 model output.
    """
    image = samples["image"]
    device = image.device

    # Vision features are computed under the model's default autocast context.
    with self.maybe_autocast():
        vision_feats = self.ln_vision(self.visual_encoder(image))
    vision_mask = torch.ones(vision_feats.size()[:-1], dtype=torch.long).to(device)

    # The learned queries cross-attend to the vision features.
    queries = self.query_tokens.expand(vision_feats.shape[0], -1, -1)
    qformer_out = self.Qformer.bert(
        query_embeds=queries,
        encoder_hidden_states=vision_feats,
        encoder_attention_mask=vision_mask,
        return_dict=True,
    )

    # Project the query outputs to the T5 hidden size; they act as a prefix.
    visual_prefix = self.t5_proj(qformer_out.last_hidden_state)
    prefix_mask = torch.ones(visual_prefix.size()[:-1], dtype=torch.long).to(device)

    # Input and target text share the same tokenization settings.
    tokenize_kwargs = dict(
        padding="longest",
        truncation=True,
        max_length=self.max_txt_len,
        return_tensors="pt",
    )

    with self.maybe_autocast(dtype=torch.bfloat16):
        source = self.t5_tokenizer(samples["text_input"], **tokenize_kwargs).to(device)
        target = self.t5_tokenizer(samples["text_output"], **tokenize_kwargs).to(device)

        # Padding positions are replaced with -100 in the labels.
        labels = target.input_ids.masked_fill(
            target.input_ids == self.t5_tokenizer.pad_token_id, -100
        )

        text_embeds = self.t5_model.encoder.embed_tokens(source.input_ids)

        outputs = self.t5_model(
            inputs_embeds=torch.cat([visual_prefix, text_embeds], dim=1),
            attention_mask=torch.cat([prefix_mask, source.attention_mask], dim=1),
            decoder_attention_mask=target.attention_mask,
            return_dict=True,
            labels=labels,
        )

    return {"loss": outputs.loss}
def predict_answers(
    self,
    samples,
    num_beams=5,
    inference_method="generate",
    max_len=10,
    min_len=1,
    num_ans_candidates=128,
    answer_list=None,
    prompt="",
    length_penalty=-1,
    **kwargs
):
    """
    Answer the questions in ``samples["text_input"]`` about ``samples["image"]``.

    Args:
        samples (dict): must provide "image" and "text_input". A bare string
            under "text_input" is wrapped into a one-element list in place.
        num_beams (int): Beam width passed to ``t5_model.generate``.
        max_len (int): Passed to ``t5_model.generate`` as ``max_new_tokens``.
        min_len (int): Passed to ``t5_model.generate`` as ``min_length``.
        prompt (str): Optional template; when non-empty, each question is
            inserted with ``prompt.format(question)``.
        length_penalty (float): Passed to ``t5_model.generate``.
        inference_method, num_ans_candidates, answer_list, **kwargs: accepted
            but not used by this implementation.
    Returns:
        list: decoded answers, lemmatized when ``self._apply_lemmatizer`` is set.
    """
    image = samples["image"]
    device = image.device

    with self.maybe_autocast():
        vision_feats = self.ln_vision(self.visual_encoder(image))
    vision_feats = vision_feats.float()
    vision_mask = torch.ones(vision_feats.size()[:-1], dtype=torch.long).to(device)

    queries = self.query_tokens.expand(vision_feats.shape[0], -1, -1)
    qformer_out = self.Qformer.bert(
        query_embeds=queries,
        encoder_hidden_states=vision_feats,
        encoder_attention_mask=vision_mask,
        return_dict=True,
    )

    visual_prefix = self.t5_proj(qformer_out.last_hidden_state)
    prefix_mask = torch.ones(visual_prefix.size()[:-1], dtype=torch.long).to(device)

    # Normalize a single question into a list; the caller's dict is updated too.
    questions = samples["text_input"]
    if isinstance(questions, str):
        questions = [questions]
        samples["text_input"] = questions

    text_input = [prompt.format(q) for q in questions] if prompt else questions

    tokens = self.t5_tokenizer(
        text_input, padding="longest", return_tensors="pt"
    ).to(device)

    full_mask = torch.cat([prefix_mask, tokens.attention_mask], dim=1)

    with self.maybe_autocast(dtype=torch.bfloat16):
        text_embeds = self.t5_model.encoder.embed_tokens(tokens.input_ids)
        full_embeds = torch.cat([visual_prefix, text_embeds], dim=1)

        generated = self.t5_model.generate(
            inputs_embeds=full_embeds,
            attention_mask=full_mask,
            do_sample=False,
            num_beams=num_beams,
            max_new_tokens=max_len,
            min_length=min_len,
            length_penalty=length_penalty,
        )
        answers = self.t5_tokenizer.batch_decode(generated, skip_special_tokens=True)

    if not self._apply_lemmatizer:
        return answers
    return self._lemmatize(answers)
@property
def lemmatizer(self):
    """
    Return the spaCy pipeline used for lemmatization, loading it on first use.

    The loaded pipeline is cached in ``self._lemmatizer``; later accesses
    return the cached object without importing spaCy again.

    Raises:
        SystemExit: with exit code 1, after logging installation
            instructions, when spaCy cannot be imported or the
            ``en_core_web_sm`` model cannot be loaded.
    """
    if self._lemmatizer is None:
        try:
            import spacy

            self._lemmatizer = spacy.load("en_core_web_sm")
        except (ImportError, OSError):
            # ImportError: spaCy itself is missing.
            # OSError: spaCy is present but the model package is not
            # installed; that case used to escape without the hint below.
            logging.error(
                """
    Please install spacy and en_core_web_sm model to apply lemmatization.
    python -m spacy download en_core_web_sm
    OR
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    """
            )
            # Same exception the `exit` builtin raises, without relying on
            # the `site` module having installed that builtin.
            raise SystemExit(1)

    return self._lemmatizer
def __init__(
    self,
    vit_model="eva_clip_g",
    img_size=224,
    drop_path_rate=0,
    use_grad_checkpoint=False,
    vit_precision="fp16",
    freeze_vit=True,
    num_query_token=32,
    t5_model="google/flan-t5-xl",
    prompt="",
    max_txt_len=128,
    max_output_txt_len=256,
    apply_lemmatizer=False,
    num_few_shot_examples=0,
    few_shot_prob=0,
    qformer_text_input=True,
):
    """
    Build the vision encoder, the Q-Former and the frozen T5 language model.

    Args:
        vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision:
            forwarded unchanged to ``init_vision_encoder``.
        freeze_vit: when True, disable gradients on the vision encoder, put it
            in eval mode and replace its ``train`` method with ``disabled_train``.
        num_query_token: forwarded to ``init_Qformer``.
        t5_model: name or path given to the T5 tokenizer, config and model
            ``from_pretrained`` loaders.
        prompt: default prompt stored on the instance.
        max_txt_len: stored as the maximum input text length.
        max_output_txt_len: stored as the maximum output text length.
        apply_lemmatizer: when set to True, postprocess predict_answers() result with lemmas.
        num_few_shot_examples, few_shot_prob: stored few-shot settings.
        qformer_text_input: when True the Q-Former keeps its text embeddings
            (resized to the tokenizer length); otherwise its text-only
            components are removed.
    """
    super().__init__()

    self.tokenizer = self.init_tokenizer(truncation_side="left")

    # --- vision encoder -------------------------------------------------
    self.visual_encoder, self.ln_vision = self.init_vision_encoder(
        vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
    )
    if freeze_vit:
        for param in self.visual_encoder.parameters():
            param.requires_grad = False
        self.visual_encoder = self.visual_encoder.eval()
        # Keep the encoder in eval mode even if .train() is called later.
        self.visual_encoder.train = disabled_train
        logging.info("freeze vision encoder")

    # --- Q-Former ---------------------------------------------------------
    self.Qformer, self.query_tokens = self.init_Qformer(
        num_query_token, self.visual_encoder.num_features
    )

    if qformer_text_input:
        self.Qformer.resize_token_embeddings(len(self.tokenizer))
    else:
        # Without text input the text-side modules are not needed.
        bert = self.Qformer.bert
        bert.embeddings.word_embeddings = None
        bert.embeddings.position_embeddings = None
        for layer in bert.encoder.layer:
            layer.output = None
            layer.intermediate = None
    self.Qformer.cls = None

    # --- T5 -----------------------------------------------------------------
    # Inputs are truncated from the left, outputs from the right.
    self.t5_tokenizer = T5TokenizerFast.from_pretrained(t5_model, truncation_side='left')
    self.t5_output_tokenizer = T5TokenizerFast.from_pretrained(t5_model, truncation_side='right')

    t5_config = T5Config.from_pretrained(t5_model)
    t5_config.dense_act_fn = "gelu"
    self.t5_model = T5ForConditionalGeneration.from_pretrained(
        t5_model, config=t5_config
    )

    # The language model is frozen and its weights are cast to bfloat16.
    for param in self.t5_model.parameters():
        param.requires_grad = False
        param.data = param.data.bfloat16()

    self.t5_proj = nn.Linear(
        self.Qformer.config.hidden_size, self.t5_model.config.hidden_size
    )

    # --- plain settings -----------------------------------------------------
    self.max_txt_len = max_txt_len
    self.max_output_txt_len = max_output_txt_len
    self.prompt = prompt

    self._apply_lemmatizer = apply_lemmatizer
    self._lemmatizer = None

    self.num_few_shot_examples = num_few_shot_examples
    self.few_shot_prob = few_shot_prob

    self.qformer_text_input = qformer_text_input
print('-----------------') + + image = samples["image"] + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + if self.qformer_text_input: + text_Qformer = self.tokenizer( + samples["text_input"], + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(image.device) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device) + Qformer_atts = torch.cat([query_atts,text_Qformer.attention_mask],dim=1) + + query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + else: + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_t5 = self.t5_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:]) + atts_t5 = torch.ones(inputs_t5.size()[:-1], dtype=torch.long).to(image.device) + + fs_embeds, fs_atts = None, None + if self.few_shot_prob > 0 and "few_shot_samples" in samples.keys(): + fs_embeds, fs_atts = self.prepare_few_shot_embeds(samples['few_shot_samples']) + + with self.maybe_autocast(dtype=torch.bfloat16): + input_tokens = self.t5_tokenizer( + samples["text_input"], + padding="longest", + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(image.device) + output_tokens = self.t5_output_tokenizer( + samples["text_output"], + padding="longest", + truncation=True, + max_length=self.max_output_txt_len, + return_tensors="pt", + ).to(image.device) + + encoder_atts = torch.cat([atts_t5, input_tokens.attention_mask], dim=1) + + targets = output_tokens.input_ids.masked_fill( + output_tokens.input_ids == 
def prepare_few_shot_embeds(self, samples):
    """
    Encode a randomly sized set of few-shot examples as a T5 input prefix.

    The number of examples used is drawn from
    ``0..self.num_few_shot_examples``: 0 is drawn with weight
    ``1 - self.few_shot_prob`` and every positive count shares
    ``self.few_shot_prob`` equally.

    Args:
        samples: iterable of few-shot records; each record is indexed as
            ``sample['image'][n]`` and ``sample['text_input'][n]`` for every
            drawn example ``n``.
    Returns:
        tuple: ``(inputs_embeds, encoder_atts)``, or ``(None, None)`` when no
            few-shot examples are configured or zero examples are drawn. When
            more than one example is drawn, the examples belonging to one
            record are folded into a single sequence along dimension 1.
    """
    max_shots = self.num_few_shot_examples
    if max_shots <= 0:
        # Nothing is configured (the class default); the weights below would
        # otherwise divide by zero.
        return None, None

    this_n_fs = random.choices(
        list(range(max_shots + 1)),
        weights=[1 - self.few_shot_prob] + [self.few_shot_prob / max_shots] * max_shots,
    )[0]

    if this_n_fs == 0:
        return None, None

    # Flatten the first `this_n_fs` examples of every record into one batch.
    images = []
    text_input = []
    for sample in samples:
        for n in range(this_n_fs):
            images.append(sample['image'][n])
            text_input.append(sample['text_input'][n])
    image = torch.stack(images, dim=0)

    with self.maybe_autocast():
        image_embeds = self.ln_vision(self.visual_encoder(image))
    image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
        image.device
    )

    query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
    if self.qformer_text_input:
        # The Q-Former also sees the instruction text.
        text_Qformer = self.tokenizer(
            text_input,
            padding='longest',
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)
        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device)
        Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1)
        query_output = self.Qformer.bert(
            text_Qformer.input_ids,
            attention_mask=Qformer_atts,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
    else:
        query_output = self.Qformer.bert(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

    # Only the query positions are projected to the T5 hidden size.
    inputs_t5 = self.t5_proj(query_output.last_hidden_state[:, :query_tokens.size(1), :])
    atts_t5 = torch.ones(inputs_t5.size()[:-1], dtype=torch.long).to(image.device)

    with self.maybe_autocast(dtype=torch.bfloat16):
        input_tokens = self.t5_tokenizer(
            text_input,
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)

        encoder_atts = torch.cat([atts_t5, input_tokens.attention_mask], dim=1)

        inputs_embeds = self.t5_model.encoder.embed_tokens(input_tokens.input_ids)
        inputs_embeds = torch.cat([inputs_t5, inputs_embeds], dim=1)

    if this_n_fs > 1:
        # Fold the examples of each record back into a single row.
        encoder_atts = encoder_atts.reshape(
            encoder_atts.size(0) // this_n_fs, encoder_atts.size(1) * this_n_fs
        )
        inputs_embeds = inputs_embeds.reshape(
            inputs_embeds.size(0) // this_n_fs,
            inputs_embeds.size(1) * this_n_fs,
            inputs_embeds.size(2),
        )

    return inputs_embeds, encoder_atts
+ + # For TextCaps + if "ocr_tokens" in samples.keys() and "{}" in prompt[0]: + prompt = [p.format(', '.join(samples['ocr_tokens'][i][:30])) for i, p in enumerate(prompt)] + + query_tokens = self.query_tokens.expand(bs, -1, -1) + if self.qformer_text_input: + # remove ocr tokens in q_former (for eval textvqa) + # qformer_prompt = prompt + # qformer_prompt = ['Question: ' + qp.split(' Question: ')[1] for qp in qformer_prompt] + + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(image.device) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device) + Qformer_atts = torch.cat([query_atts,text_Qformer.attention_mask],dim=1) + + # For video data + if image.dim() == 5: + inputs_t5, atts_t5 = [], [] + for j in range(image.size(2)): + this_frame = image[:,:,j,:,:] + with self.maybe_autocast(): + frame_embeds = self.ln_vision(self.visual_encoder(this_frame)) + frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + frame_query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask = Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + else: + frame_query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + + frame_inputs_t5 = self.t5_proj(frame_query_output.last_hidden_state[:,:query_tokens.size(1),:]) + frame_atts_t5 = torch.ones(frame_inputs_t5.size()[:-1], dtype=torch.long).to(image.device) + inputs_t5.append(frame_inputs_t5) + atts_t5.append(frame_atts_t5) + inputs_t5 = torch.cat(inputs_t5, dim=1) + atts_t5 = torch.cat(atts_t5, dim=1) + else: + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)) + image_atts = 
def predict_answers(
    self,
    samples,
    num_beams=5,
    inference_method="generate",
    max_len=10,
    min_len=1,
    num_ans_candidates=128,
    answer_list=None,
    prompt="",
    length_penalty=-1,
    **kwargs
):
    """
    Build the final prompts for a batch of questions and decode the answers.

    Args:
        samples (dict): must provide "text_input" (a string or a list of
            strings; a bare string is wrapped into a list in place) plus
            whatever ``self.generate`` needs. "ocr_tokens" or "choices" is
            required when ``prompt`` has two ``{}`` placeholders. The built
            prompts are stored back under ``samples["prompt"]``. A truthy
            "apply_lemmatizer" entry turns on lemmatization for this call.
        num_beams (int): Forwarded to ``self.generate``.
        max_len (int): Forwarded to ``self.generate`` as ``max_length``.
        min_len (int): Forwarded to ``self.generate`` as ``min_length``.
        prompt (str): Optional template. With exactly two ``{}`` it is
            formatted as (OCR tokens, question) or (question, lettered
            choices); otherwise as (question,).
        length_penalty (float): Forwarded to ``self.generate``.
        inference_method, num_ans_candidates, answer_list, **kwargs: accepted
            but not used by this implementation.
    Returns:
        list: the generated answers, lemmatized when requested.
    Raises:
        ValueError: if ``prompt`` has two placeholders but ``samples`` has
            neither "ocr_tokens" nor "choices".
    """
    if isinstance(samples["text_input"], str):
        samples["text_input"] = [samples["text_input"]]
    questions = samples["text_input"]

    if not prompt:
        text_input = questions
    elif prompt.count("{}") == 2:
        if 'ocr_tokens' in samples:
            # (OCR tokens, question): at most the first 30 tokens are used.
            text_input = [
                prompt.format(', '.join(samples['ocr_tokens'][i][:30]), question)
                for i, question in enumerate(questions)
            ]
        elif 'choices' in samples:
            # (question, choices) rendered as "(a) ... (b) ...".
            text_input = []
            for i, question in enumerate(questions):
                this_choices = " ".join(
                    f"({string.ascii_lowercase[j]}) {ch}"
                    for j, ch in enumerate(samples["choices"][i])
                )
                text_input.append(prompt.format(question, this_choices))
        else:
            # Previously `text_input` was left unbound here and the method
            # failed further down with an UnboundLocalError.
            raise ValueError(
                "A prompt with two '{}' placeholders requires 'ocr_tokens' "
                "or 'choices' in samples."
            )
    else:
        text_input = [prompt.format(question) for question in questions]

    samples["prompt"] = text_input

    output_text = self.generate(
        samples,
        num_beams=num_beams,
        max_length=max_len,
        min_length=min_len,
        length_penalty=length_penalty
    )

    if self._apply_lemmatizer or ("apply_lemmatizer" in samples.keys() and samples["apply_lemmatizer"]):
        output_text = self._lemmatize(output_text)

    return output_text
samples (dict): A dictionary containing the following keys: + - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W) + - prompt: the instruction + candidates: + (list): A list of candidate class names; + n_segments: + (int): Split the candidates into n_segments and predict one by one. This is useful when the number of candidates is too large. + Returns: + output_class: predicted class index + """ + + image = samples["image"] + prompt = samples["prompt"] + + bs = image.size(0) + + if isinstance(prompt, str): + prompt = [prompt] * bs + else: + assert len(prompt) == bs, "The number of prompts must be equal to the batch size." + + if "text_input" in samples.keys(): + if type(samples["text_input"][0]) == list: + prompt = [prompt[i].format(*samples["text_input"][i]) for i in range(len(prompt))] + else: + prompt = [prompt[i].format(samples["text_input"][i]) for i in range(len(prompt))] + + # scienceqa + if 'context' in samples.keys() and samples['context'] != '': + prompt = [f'context: {samples["context"][i]}. {prompt[i]}' for i in range(len(prompt))] + + # visual dialog + if 'history' in samples.keys() and samples['history'][0] != '': + prompt = [f'dialog history: {samples["history"][i]}\n{prompt[i]}' for i in range(len(prompt))] + + if 'caption' in samples.keys() and samples['caption'][0] != '': + prompt = [f'This image has the caption "{samples["caption"][i]}". 
{prompt[i]}' for i in range(len(prompt))] + + query_tokens = self.query_tokens.expand(bs, -1, -1) + if self.qformer_text_input: + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt" + ).to(image.device) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device) + Qformer_atts = torch.cat([query_atts,text_Qformer.attention_mask], dim=1) + + if image.dim() == 5: + inputs_t5, atts_t5 = [], [] + for j in range(image.size(2)): + this_frame = image[:,:,j,:,:] + with self.maybe_autocast(): + frame_embeds = self.ln_vision(self.visual_encoder(this_frame)) + frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + frame_query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + else: + frame_query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + + frame_inputs_t5 = self.t5_proj(frame_query_output.last_hidden_state[:,:query_tokens.size(1),:]) + frame_atts_t5 = torch.ones(frame_inputs_t5.size()[:-1], dtype=torch.long).to(image.device) + inputs_t5.append(frame_inputs_t5) + atts_t5.append(frame_atts_t5) + inputs_t5 = torch.cat(inputs_t5, dim=1) + atts_t5 = torch.cat(atts_t5, dim=1) + else: + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + else: + query_output = 
self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_t5 = self.t5_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:]) + atts_t5 = torch.ones(inputs_t5.size()[:-1], dtype=torch.long).to(image.device) + + input_tokens = self.t5_tokenizer( + prompt, padding="longest", return_tensors="pt" + ).to(image.device) + output_tokens = self.t5_tokenizer( + candidates, padding="longest", return_tensors="pt" + ).to(image.device) + + encoder_atts = torch.cat([atts_t5, input_tokens.attention_mask], dim=1) + + n_cands = len(candidates) + + with self.maybe_autocast(dtype=torch.bfloat16): + inputs_embeds = self.t5_model.encoder.embed_tokens(input_tokens.input_ids) + inputs_embeds = torch.cat([inputs_t5, inputs_embeds], dim=1) + + encoder_outputs = self.t5_model.encoder( + inputs_embeds=inputs_embeds, + attention_mask=encoder_atts, + ) + + all_losses = [] + for n in range(n_segments): + seg_len = n_cands // n_segments + if n == (n_segments - 1): + seg_len = n_cands - seg_len * (n_segments - 1) + + # this_encoder_outputs = copy.deepcopy(encoder_outputs) + this_encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0].clone(), + ) + + this_encoder_outputs['last_hidden_state'] = this_encoder_outputs[0].repeat_interleave(seg_len, dim=0) + this_encoder_atts = encoder_atts.repeat_interleave(seg_len, dim=0) + + start_i = n * (n_cands // n_segments) + end_i = start_i + seg_len + this_output_tokens_ids = output_tokens.input_ids[start_i:end_i].repeat(bs, 1) + this_output_tokens_atts = output_tokens.attention_mask[start_i:end_i].repeat(bs, 1) + + this_targets = this_output_tokens_ids.masked_fill(this_output_tokens_ids == self.t5_tokenizer.pad_token_id, -100) + + outputs = self.t5_model( + encoder_outputs=this_encoder_outputs, + attention_mask=this_encoder_atts, + decoder_attention_mask=this_output_tokens_atts, + return_dict=True, + labels=this_targets, + 
def _lemmatize(self, answers):
    """
    Replace nouns and verbs in each answer by their lemma.

    Args:
        answers (list): Answer strings; each one is parsed with ``self.lemmatizer``.
    Returns:
        list: One string per answer, its tokens joined by single spaces. Tokens
        tagged "NOUN" or "VERB" contribute ``lemma_``, all others ``text``.
    """
    def apply(answer):
        return " ".join(
            token.lemma_ if token.pos_ in ("NOUN", "VERB") else token.text
            for token in self.lemmatizer(answer)
        )

    return [apply(answer) for answer in answers]


@property
def lemmatizer(self):
    """
    Lazily load and cache the spaCy "en_core_web_sm" pipeline.

    Returns:
        The cached pipeline stored in ``self._lemmatizer``.
    Raises:
        SystemExit: With exit code 1 (after logging install instructions) when
            spaCy cannot be imported or the model cannot be loaded.
    """
    if self._lemmatizer is None:
        try:
            import spacy

            self._lemmatizer = spacy.load("en_core_web_sm")
        except (ImportError, OSError) as err:
            # ImportError: spaCy itself is missing. OSError: spaCy is installed
            # but the en_core_web_sm model is not, which the message also covers.
            logging.error(
                """
                Please install spacy and en_core_web_sm model to apply lemmatization.
                python -m spacy download en_core_web_sm
                OR
                import spacy.cli
                spacy.cli.download("en_core_web_sm")
                """
            )
            # Same effect as exit(1), without relying on the site-provided builtin.
            raise SystemExit(1) from err

    return self._lemmatizer


@classmethod
def from_config(cls, cfg):
    """
    Build a model from a config object and load its checkpoint.

    Args:
        cfg: Mapping-like config exposing ``get(key, default)``. "image_size",
            "num_query_token" and "t5_model" have no fallback here (``get``
            returns its own default when they are missing); every other key
            falls back to the default written below.
    Returns:
        The ``cls`` instance after ``load_checkpoint_from_config(cfg)`` was called.
    """
    model = cls(
        vit_model=cfg.get("vit_model", "eva_clip_g"),
        img_size=cfg.get("image_size"),
        drop_path_rate=cfg.get("drop_path_rate", 0),
        use_grad_checkpoint=cfg.get("use_grad_checkpoint", False),
        vit_precision=cfg.get("vit_precision", "fp16"),
        freeze_vit=cfg.get("freeze_vit", True),
        num_query_token=cfg.get("num_query_token"),
        t5_model=cfg.get("t5_model"),
        prompt=cfg.get("prompt", ""),
        max_txt_len=cfg.get("max_txt_len", 128),
        max_output_txt_len=cfg.get("max_output_txt_len", 256),
        apply_lemmatizer=cfg.get("apply_lemmatizer", False),
        num_few_shot_examples=cfg.get("num_few_shot_examples", 0),
        few_shot_prob=cfg.get("few_shot_prob", 0.0),
        qformer_text_input=cfg.get("qformer_text_input", True),
    )

    model.load_checkpoint_from_config(cfg)

    return model
b/lavis/models/blip2_models/blip2_vicuna_instruct.py @@ -0,0 +1,739 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + + +""" +Requires Transformer 4.28 and above, implementation may change according the Llama implementation +""" +import logging +import string +from packaging import version + +import torch +from torch.cuda.amp import autocast as autocast +import torch.nn as nn + +import transformers + +from lavis.common.registry import registry +from lavis.models.blip2_models.blip2 import Blip2Base, disabled_train + +@registry.register_model("blip2_vicuna_instruct") +class Blip2VicunaInstruct(Blip2Base): + """ + BLIP2 Vicuna model. + Supported model types: + - vicuna7b + - vicuna13b + Usage: + >>> from lavis.models import load_model + >>> model = load_model("blip2_vicuna_instruct", "vicuna7b") + """ + + PRETRAINED_MODEL_CONFIG_DICT = { + "vicuna7b": "configs/models/blip2/blip2_instruct_vicuna7b.yaml", + "vicuna13b": "configs/models/blip2/blip2_instruct_vicuna13b.yaml", + } + + def __init__( + self, + vit_model="eva_clip_g", + img_size=224, + drop_path_rate=0, + use_grad_checkpoint=False, + vit_precision="fp16", + freeze_vit=True, + num_query_token=32, + llm_model="", + prompt="", + max_txt_len=128, + max_output_txt_len=256, + apply_lemmatizer=False, + qformer_text_input=True, + ): + super().__init__() + transformers_version = version.parse(transformers.__version__) + assert transformers_version >= version.parse("4.28"), "BLIP-2 Vicuna requires transformers>=4.28" + from transformers import LlamaTokenizer + from lavis.models.blip2_models.modeling_llama import LlamaForCausalLM + + self.tokenizer = self.init_tokenizer(truncation_side="left") + + self.visual_encoder, self.ln_vision = self.init_vision_encoder( + vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision + ) + if 
def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts):
    """
    Splice each output sequence in right after the attended part of its input.

    For every row the result is laid out as
    ``input[:k] + output[1:] + input[k:]`` where ``k`` is the sum of that row's
    input attention mask; the first output position is dropped. The attention
    masks are spliced the same way.

    Args:
        input_ids, input_atts: Token ids and attention mask of the inputs,
            indexed by row along dim 0.
        output_ids, output_atts: Token ids and attention mask of the outputs,
            indexed with the same row numbers.
    Returns:
        tuple: ``(llm_tokens, input_part_targets_len)`` where ``llm_tokens`` is a
        dict with stacked "input_ids" and "attention_mask" tensors and
        ``input_part_targets_len`` is a list holding ``k`` for every row.
    """
    attended_lengths = []
    spliced_ids = []
    spliced_atts = []

    for row, row_ids in enumerate(input_ids):
        row_atts = input_atts[row]
        k = row_atts.sum()
        attended_lengths.append(k)

        head_ids, tail_ids = row_ids[:k], row_ids[k:]
        head_atts, tail_atts = row_atts[:k], row_atts[k:]

        spliced_ids.append(torch.cat([head_ids, output_ids[row][1:], tail_ids]))
        spliced_atts.append(torch.cat([head_atts, output_atts[row][1:], tail_atts]))

    llm_tokens = {
        "input_ids": torch.stack(spliced_ids),
        "attention_mask": torch.stack(spliced_atts),
    }
    return llm_tokens, attended_lengths
self.llm_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:]) + atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device) + + self.llm_tokenizer.padding_side = "right" + self.llm_tokenizer.truncation_side = 'left' + text_input_tokens = self.llm_tokenizer( + samples['text_input'], + return_tensors="pt", + padding="longest", + truncation=True, + max_length=self.max_txt_len, + ).to(image.device) + + self.llm_tokenizer.truncation_side = 'right' + text_output_tokens = self.llm_tokenizer( + [t + self.llm_tokenizer.eos_token for t in samples['text_output']], + return_tensors="pt", + padding="longest", + truncation=True, + max_length=self.max_output_txt_len, + ).to(image.device) + + llm_tokens, input_part_targets_len = self.concat_text_input_output( + text_input_tokens.input_ids, + text_input_tokens.attention_mask, + text_output_tokens.input_ids, + text_output_tokens.attention_mask, + ) + + # do not apply loss to the padding + targets = llm_tokens['input_ids'].masked_fill( + llm_tokens['input_ids'] == self.llm_tokenizer.pad_token_id, -100 + ) + + # do not apply loss to the text input (i.e., instruction) + for i, l in enumerate(input_part_targets_len): + targets[i][:l] = -100 + + # do not apply loss to the query tokens + empty_targets = ( + torch.ones(atts_llm.size(), dtype=torch.long).to(image.device).fill_(-100) + ) + targets = torch.cat([empty_targets, targets], dim=1) + + inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens['input_ids']) + inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1) + attention_mask = torch.cat([atts_llm, llm_tokens['attention_mask']], dim=1) + + with self.maybe_autocast(): + outputs = self.llm_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + return_dict=True, + labels=targets, + ) + + loss = outputs.loss + + return {"loss": loss} + + @torch.no_grad() + def generate( + self, + samples, + use_nucleus_sampling=False, + num_beams=5, + max_length=256, + min_length=1, + 
top_p=0.9, + repetition_penalty=1.5, + length_penalty=1, + num_captions=1, + temperature=1, + ): + self.llm_tokenizer.padding_side = "left" + + if "prompt" in samples.keys(): + prompt = samples["prompt"] + else: + prompt = self.prompt + + image = samples["image"] + + bs = image.size(0) + + if isinstance(prompt, str): + prompt = [prompt] * bs + else: + assert len(prompt) == bs, "The number of prompts must be equal to the batch size." + + # For TextCaps + if "ocr_tokens" in samples.keys() and "{}" in prompt[0]: + prompt = [p.format(', '.join(samples['ocr_tokens'][i][:30])) for i, p in enumerate(prompt)] + + query_tokens = self.query_tokens.expand(bs, -1, -1) + if self.qformer_text_input: + # remove ocr tokens in q_former (for eval textvqa) + # qformer_prompt = prompt + # qformer_prompt = ['Question: ' + qp.split(' Question: ')[1] for qp in qformer_prompt] + + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(image.device) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device) + Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1) + + # For video data + if image.dim() == 5: + inputs_llm, atts_llm = [], [] + for j in range(image.size(2)): + this_frame = image[:,:,j,:,:] + with self.maybe_autocast(): + frame_embeds = self.ln_vision(self.visual_encoder(this_frame)) + frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + frame_query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + else: + frame_query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + frame_inputs_llm = 
self.llm_proj(frame_query_output.last_hidden_state[:,:query_tokens.size(1),:]) + frame_atts_llm = torch.ones(frame_inputs_llm.size()[:-1], dtype=torch.long).to(image.device) + inputs_llm.append(frame_inputs_llm) + atts_llm.append(frame_atts_llm) + inputs_llm = torch.cat(inputs_llm, dim=1) + atts_llm = torch.cat(atts_llm, dim=1) + else: + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + else: + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_llm = self.llm_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:]) + atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device) + + llm_tokens = self.llm_tokenizer( + prompt, + padding="longest", + return_tensors="pt" + ).to(image.device) + + with self.maybe_autocast(): + inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids) + inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1) + attention_mask = torch.cat([atts_llm, llm_tokens.attention_mask], dim=1) + + outputs = self.llm_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + do_sample=use_nucleus_sampling, + top_p=top_p, + temperature=temperature, + num_beams=num_beams, + max_length=max_length, + min_length=min_length, + # eos_token_id=self.eos_token_id, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + num_return_sequences=num_captions, + ) + + outputs[outputs == 0] = 2 # convert output id 0 to 2 (eos_token_id) + output_text = 
def predict_answers(
    self,
    samples,
    num_beams=5,
    inference_method="generate",
    max_len=10,
    min_len=1,
    num_ans_candidates=128,
    answer_list=None,
    prompt="",
    length_penalty=0,
    **kwargs
):
    """
    Format one prompt per question and answer it with ``self.generate``.

    Args:
        samples (dict): Must contain "text_input" (a string or a list of strings).
            "ocr_tokens" or "choices" are read when ``prompt`` has two "{}"
            placeholders; "apply_lemmatizer" is read after generation.
            The dict is modified in place: "text_input" is normalised to a list
            and "prompt" is set to the formatted prompts.
        num_beams, max_len, min_len, length_penalty: Forwarded to ``self.generate``
            (as num_beams, max_length, min_length, length_penalty).
        inference_method, num_ans_candidates, answer_list, **kwargs: Accepted for
            interface compatibility; not used by this method.
        prompt (str): Template. Empty: questions are used as-is. One "{}": filled
            with the question. Two "{}": filled with (OCR tokens, question) when
            "ocr_tokens" is present, else (question, lettered choices) when
            "choices" is present.
    Returns:
        The output of ``self.generate``, passed through ``self._lemmatize`` when
        ``samples["apply_lemmatizer"]`` is truthy.
    Raises:
        ValueError: ``prompt`` has two placeholders but ``samples`` has neither
            "ocr_tokens" nor "choices".
    """
    if isinstance(samples["text_input"], str):
        samples["text_input"] = [samples["text_input"]]
    questions = samples["text_input"]

    if not prompt:
        text_input = questions
    elif prompt.count("{}") == 2:
        if "ocr_tokens" in samples:
            # Only the first 30 OCR tokens of each sample go into the prompt.
            text_input = [
                prompt.format(", ".join(samples["ocr_tokens"][i][:30]), question)
                for i, question in enumerate(questions)
            ]
        elif "choices" in samples:
            text_input = []
            for i, question in enumerate(questions):
                # Render choices as "(a) foo (b) bar ...".
                options = " ".join(
                    f"({string.ascii_lowercase[j]}) {ch}"
                    for j, ch in enumerate(samples["choices"][i])
                )
                text_input.append(prompt.format(question, options))
        else:
            # Previously this path left `text_input` unbound and crashed with
            # UnboundLocalError further down.
            raise ValueError(
                "A prompt with two '{}' placeholders requires 'ocr_tokens' or 'choices' in samples."
            )
    else:
        text_input = [prompt.format(question) for question in questions]

    samples["prompt"] = text_input

    output_text = self.generate(
        samples,
        num_beams=num_beams,
        max_length=max_len,
        min_length=min_len,
        length_penalty=length_penalty,
    )

    if "apply_lemmatizer" in samples and samples["apply_lemmatizer"]:
        output_text = self._lemmatize(output_text)

    return output_text


def predict_class(
    self,
    samples,
    candidates,
    n_segments=1,
):
    """
    Rank candidate class names for every sample via ``self._predict_class``.

    Sets ``self.llm_tokenizer.padding_side`` to "left" before scoring.

    Args:
        samples (dict): Must contain "image" (indexed along dim 0) and "prompt"
            (a string shared by the batch, or a sequence with one prompt per
            sample). Optional per-sample lists "text_input", "context",
            "history" and "caption" are forwarded.
        candidates (list): Either one candidate list shared by the whole batch,
            or a list of lists holding one candidate list per sample.
        n_segments (int): Forwarded unchanged to ``self._predict_class``.
    Returns:
        Shared candidates: the result of ``self._predict_class`` on the batch.
        Per-sample candidates: the per-sample results concatenated along dim 0,
        or a list of plain Python lists when they cannot be concatenated
        (e.g. samples have different numbers of candidates).
    """
    self.llm_tokenizer.padding_side = "left"

    if not isinstance(candidates[0], list):
        return self._predict_class(samples, candidates, n_segments)

    # Each sample has its own candidates, so they are scored one by one.
    prompt = samples["prompt"]
    results = []
    for i in range(samples["image"].size(0)):
        this_sample = {
            "image": samples["image"][i].unsqueeze(0),
            # A per-sample prompt list must be narrowed to this sample: the
            # sub-batch has size 1 and _predict_class asserts len(prompt) == bs.
            "prompt": prompt if isinstance(prompt, str) else prompt[i],
        }
        for key in ("text_input", "context", "history", "caption"):
            if key in samples:
                this_sample[key] = [samples[key][i]]

        results.append(self._predict_class(this_sample, candidates[i], n_segments))

    try:
        results = torch.cat(results, dim=0)
    except RuntimeError:
        # torch.cat rejects rows of different widths; fall back to Python lists.
        results = [res.tolist()[0] for res in results]

    return results
{prompt[i]}' for i in range(len(prompt))] + + query_tokens = self.query_tokens.expand(bs, -1, -1) + if self.qformer_text_input: + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt" + ).to(image.device) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device) + Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1) + + if image.dim() == 5: + inputs_llm, atts_llm = [], [] + for j in range(image.size(2)): + this_frame = image[:,:,j,:,:] + with self.maybe_autocast(): + frame_embeds = self.ln_vision(self.visual_encoder(this_frame)) + frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + frame_query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + else: + frame_query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=frame_embeds, + encoder_attention_mask=frame_atts, + return_dict=True, + ) + + frame_inputs_llm = self.llm_proj(frame_query_output.last_hidden_state[:,:query_tokens.size(1),:]) + frame_atts_llm = torch.ones(frame_inputs_llm.size()[:-1], dtype=torch.long).to(image.device) + inputs_llm.append(frame_inputs_llm) + atts_llm.append(frame_atts_llm) + inputs_llm = torch.cat(inputs_llm, dim=1) + atts_llm = torch.cat(atts_llm, dim=1) + else: + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + if self.qformer_text_input: + query_output = self.Qformer.bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + else: + 
query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_llm = self.llm_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:]) + atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device) + + self.llm_tokenizer.padding_side = "right" + self.llm_tokenizer.truncation_side = 'left' + text_input_tokens = self.llm_tokenizer( + prompt, + return_tensors="pt", + padding="longest", + # truncation=True, + # max_length=self.max_txt_len, + ).to(image.device) + + empty_targets = torch.ones(atts_llm.size(), dtype=torch.long).to(image.device).fill_(-100) + + # self.llm_tokenizer.padding_side = "right" + self.llm_tokenizer.truncation_side = 'right' + n_cands = len(candidates) + with self.maybe_autocast(dtype=torch.bfloat16): + all_losses = [] + for n in range(n_segments): + seg_len = n_cands // n_segments + if n == (n_segments - 1): + seg_len = n_cands - seg_len * (n_segments - 1) + + start_i = n * (n_cands // n_segments) + end_i = start_i + seg_len + + this_output_tokens = self.llm_tokenizer( + candidates[start_i:end_i], + return_tensors="pt", + padding="longest", + # truncation=True, + # max_length=self.max_output_txt_len, + ).to(image.device) + + this_input_tokens_ids = text_input_tokens.input_ids.repeat_interleave(seg_len, dim=0) + this_input_tokens_atts = text_input_tokens.attention_mask.repeat_interleave(seg_len, dim=0) + + this_output_tokens_ids = this_output_tokens.input_ids.repeat(bs, 1) + this_output_tokens_atts = this_output_tokens.attention_mask.repeat(bs, 1) + + this_llm_tokens, this_input_targets_len = self.concat_text_input_output( + this_input_tokens_ids, + this_input_tokens_atts, + this_output_tokens_ids, + this_output_tokens_atts + ) + + this_llm_input_ids = this_llm_tokens['input_ids'] + this_llm_atts = this_llm_tokens['attention_mask'] + # this_llm_input_ids = torch.cat([this_input_tokens_ids, 
this_output_tokens_ids], dim=1) + # this_llm_atts = torch.cat([this_input_tokens_atts, this_output_tokens_atts], dim=1) + + inputs_embeds = self.llm_model.get_input_embeddings()(this_llm_input_ids) + inputs_embeds = torch.cat([inputs_llm.repeat_interleave(seg_len, dim=0), inputs_embeds], dim=1) + attention_mask = torch.cat([atts_llm.repeat_interleave(seg_len, dim=0), this_llm_atts], dim=1) + + this_targets = this_llm_input_ids.masked_fill(this_llm_input_ids == self.llm_tokenizer.pad_token_id, -100) + # this_targets[:, :this_input_tokens_ids.size(1)] = -100 + for i, l in enumerate(this_input_targets_len): + this_targets[i][:l] = -100 + + this_targets = torch.cat([empty_targets.repeat_interleave(seg_len, dim=0), this_targets], dim=1) + + outputs = self.llm_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + return_dict=True, + labels=this_targets, + reduction="none", + ) + + loss = outputs.loss + + loss = loss.reshape(bs, seg_len) + # output_class_ranks = torch.argsort(loss, dim=-1) + all_losses.append(loss) + + all_losses = torch.cat(all_losses, dim=-1) + output_class_ranks = torch.argsort(all_losses, dim=-1) + + return output_class_ranks + + def _lemmatize(self, answers): + def apply(answer): + doc = self.lemmatizer(answer) + + words = [] + for token in doc: + if token.pos_ in ["NOUN", "VERB"]: + words.append(token.lemma_) + else: + words.append(token.text) + answer = " ".join(words) + + return answer + + return [apply(answer) for answer in answers] + + @property + def lemmatizer(self): + if self._lemmatizer is None: + try: + import spacy + + self._lemmatizer = spacy.load("en_core_web_sm") + except ImportError: + logging.error( + """ + Please install spacy and en_core_web_sm model to apply lemmatization. 
@classmethod
def from_config(cls, cfg):
    """Build a model from a config mapping and load its checkpoint.

    Constructor options are read from ``cfg`` with ``cfg.get``; options
    that are absent fall back to the defaults written below
    (``image_size``, ``num_query_token`` and ``llm_model`` have no
    explicit fallback). After construction the instance's
    ``load_checkpoint_from_config(cfg)`` is called.

    Args:
        cfg: Config object exposing a ``get(key[, default])`` method.

    Returns:
        The constructed model instance.
    """
    init_kwargs = {
        "vit_model": cfg.get("vit_model", "eva_clip_g"),
        "img_size": cfg.get("image_size"),
        "num_query_token": cfg.get("num_query_token"),
        "llm_model": cfg.get("llm_model"),
        "drop_path_rate": cfg.get("drop_path_rate", 0),
        "use_grad_checkpoint": cfg.get("use_grad_checkpoint", False),
        "vit_precision": cfg.get("vit_precision", "fp16"),
        "freeze_vit": cfg.get("freeze_vit", True),
        "prompt": cfg.get("prompt", ""),
        "max_txt_len": cfg.get("max_txt_len", 128),
        "max_output_txt_len": cfg.get("max_output_txt_len", 256),
        "apply_lemmatizer": cfg.get("apply_lemmatizer", False),
        "qformer_text_input": cfg.get("qformer_text_input", True),
    }
    instance = cls(**init_kwargs)

    # NOTE: a hard-coded load of the BLIP-2 stage-1 weights used to sit
    # here; it is disabled, so weights come only from the call below.
    instance.load_checkpoint_from_config(cfg)
    return instance
class CastOutputToFloat(nn.Sequential):
    """``nn.Sequential`` whose output is converted to ``torch.float32``."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and cast the result to float32."""
        result = super().forward(x)
        return result.to(torch.float32)
+ Supported model types: + - vicuna7b + - vicuna13b + Usage: + >>> from lavis.models import load_model + >>> model = load_model("blip2_vicuna_xinstruct", "vicuna7b") + """ + + PRETRAINED_MODEL_CONFIG_DICT = { + "vicuna7b": "configs/models/blip2/blip2_xinstruct_vicuna7b.yaml", + "vicuna13b": "configs/models/blip2/blip2_xinstruct_vicuna13b.yaml", + } + + SEQUENCIAL_ENCODERS = [ + "eva_clip_g", + "beats" + ] + + SEQUENCIAL_MODALITIES = [ + "video", + "audio" + ] + + MODALITY_TO_CUE = { + "image": " image: ", + "pc": " 3d: ", + "video": " video: ", + "audio": " audio: ", + } + + def __init__( + self, + + modalities = ["image", "pc", "audio", "video"], + use_cues=True, + num_query_token=32, + qformer_text_input=True, + llm_text_input=False, + apply_lemmatizer=False, + + ## encoders + image_model="eva_clip_g", + pc_model="ulip2_pointbert", + video_model="eva_clip_g", + audio_model="beats", + + image_encoder_kwargs = {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}, + pc_encoder_kwargs = {}, + video_encoder_kwargs = {}, + audio_encoder_kwargs = {}, + + image_precision="fp16", + pc_precision="fp16", + video_precision="fp16", + audio_precision="fp16", + + freeze_image=True, + freeze_pc=True, + freeze_video=True, + freeze_audio=True, + + ## load pretrained parameters + pretrained_image_qformer=None, + pretrained_pc_qformer=None, + pretrained_video_qformer=None, + pretrained_audio_qformer=None, + + load_attention_image_qformer=False, + load_attention_pc_qformer=False, + load_attention_video_qformer=False, + load_attention_audio_qformer=False, + + load_qformer_type_image="", + load_qformer_type_pc="", + load_qformer_type_video="", + load_qformer_type_audio="", + + load_ln_type_image="", + load_ln_type_pc="", + load_ln_type_video="", + load_ln_type_audio="", + + load_projection_image=True, + load_projection_pc=True, + load_projection_video=True, + load_projection_audio=True, + + load_projection_type_image="", + load_projection_type_pc="", + 
load_projection_type_video="", + load_projection_type_audio="", + + ## llm model parameters + llm_model="", + lora_model="", + lora=False, + + ## generation parameters + prompt="", + prefix="", + postfix="", + max_txt_len=128, + max_output_txt_len=256, + special_qformer_input_prompt=False, + enumerate_inputs=False, + add_space=False, + remove_start=False, + clean_tokenization=False, # if set to true removes whitespace from cue, and start token from prompt. + + ## shared Q-former setup + shared_qformer=False, + pretrained_shared_qformer=None, + load_attention_shared_qformer=False, + load_qformer_type_shared="", + load_projection_shared=False, + load_projection_type_shared="", + encoder_projection_type_image="", + encoder_projection_type_pc="", + encoder_projection_type_video="", + encoder_projection_type_audio="", + shared_qformer_num_features=512, + + ## use cached features + cached_audio=False, + cached_image=False, + cached_pc=False, + cached_video=False, + + ## num features for modality (only needed in cached cases.) 
+ num_features_audio=768, + num_features_image=1408, + num_features_video=1408, + num_features_pc=512, + + joint_video_audio=False, + + ## DisCRN + use_caption=False, + use_describe=False, + + ## classification setup + predict_with_gen=False, + format_candidates_prompt="{}", + + + ## projection only parameters + projection_only=False, + projection_only_audio=False, + projection_only_pc=False, + projection_only_video=False, + projection_only_image=False, + + projection_path_audio=False, + projection_path_pc=False, + projection_path_video=False, + projection_path_image=False, + + proj_dim=1, + + + ): + + super().__init__() + + transformers_version = version.parse(transformers.__version__) + assert transformers_version >= version.parse("4.28"), "BLIP-2 Vicuna requires transformers>=4.28" + from transformers import LlamaTokenizer + from lavis.models.blip2_models.modeling_llama import LlamaForCausalLM + logging.info(f"Using modalities {modalities}") + self.modalities = modalities + + logging.info(f"Shared Qformer is set to {shared_qformer}") + self.shared_qformer = shared_qformer + + logging.info(f"Video-audio interleaving is set to {joint_video_audio}") + self.joint_video_audio = joint_video_audio + + logging.info(f"Using Spacy en_core_wb_sm lemmatizer is set to {apply_lemmatizer}") + self._lemmatizer = None + self.apply_lemmatizer = apply_lemmatizer + + logging.info(f"Qformer text input {qformer_text_input} and LLM Text Input {llm_text_input}") + self.qformer_text_input = qformer_text_input + self.llm_text_input = llm_text_input + + self.projection_only = projection_only + self.proj_dim = proj_dim + logging.info(f"Projection only setup is set to {projection_only} with dimension {proj_dim}") + + for modality in self.modalities: + setattr(self, f"cached_{modality}", locals()[f"cached_{modality}"]) + if locals()[f"cached_{modality}"]: + setattr(self, f"num_features_{modality}", locals()[f"num_features_{modality}"]) + logging.info(f"Using cached {modality} representation 
with {getattr(self, f'num_features_{modality}')} embedding dim.") + + ### Initialize modality enoders ### + for modality in self.modalities: + modality_model = locals()[f"{modality}_model"] + modality_precision = locals()[f"{modality}_precision"] + modality_kwargs = locals()[f"{modality}_encoder_kwargs"] + modality_kwargs['load_ln_path'] = locals()[f"pretrained_shared_qformer"] if shared_qformer else \ + locals()[f"pretrained_{modality}_qformer"] + setattr(self, f"projection_only_{modality}", locals()[f"projection_only_{modality}"]) + setattr(self, f"projection_path_{modality}", locals()[f"projection_path_{modality}"]) + modality_kwargs['load_ln_type'] = locals()[f"load_ln_type_{modality}"] + if self.projection_only or locals()[f"projection_only_{modality}"]: + modality_kwargs['load_ln_path'] = getattr(self, f"projection_path_{modality}") + modality_kwargs['load_ln_type'] = modality + setattr(self, f"load_ln_type_{modality}", locals()[f"load_ln_type_{modality}"]) + setattr(self, f"pretrained_{modality}_qformer", locals()[f"pretrained_{modality}_qformer"]) + modality_encoder, modality_ln = getattr(self, f"init_{modality}_encoder")( + modality_model, + precision=modality_precision, + **modality_kwargs + ) + + freeze_modality = locals()[f"freeze_{modality}"] + cached_modality = locals()[f"cached_{modality}"] + if cached_modality: + setattr(self, f"{modality}_encoder", modality_encoder) + setattr(self, f"{modality}_ln", modality_ln) + continue + if freeze_modality: + for name, param in modality_encoder.named_parameters(): + param.requires_grad = False + modality_encoder = modality_encoder.eval() + modality_encoder.train = disabled_train + logging.info(f"freeze {modality} encoder") + + setattr(self, f"{modality}_encoder", modality_encoder) + setattr(self, f"{modality}_ln", modality_ln) + + ##### Init QFormers #### + self.tokenizer = self.init_tokenizer(truncation_side="left") # 30523 tokens. 
+ self.num_query_token = num_query_token + if self.shared_qformer: + logging.info(f"Initializing shared QFormer with {shared_qformer_num_features} \ + number of features and query tokens of length {num_query_token}") + setattr(self, f"pretrained_shared_qformer", pretrained_shared_qformer) + setattr(self, f"load_qformer_type_shared", load_qformer_type_shared) + self.shared_Qformer, self.shared_query_tokens = self.init_Qformer( + num_query_token, + shared_qformer_num_features, + pretrained_qformer=pretrained_shared_qformer, + load_attention=load_attention_shared_qformer, + load_qformer_type=load_qformer_type_shared + ) + + if not qformer_text_input: + self.shared_Qformer.bert.embeddings.word_embeddings = None + self.shared_Qformer.bert.embeddings.position_embeddings = None + for layer in self.shared_Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + else: + self.shared_Qformer.resize_token_embeddings(len(self.tokenizer)) + self.shared_Qformer.cls = None + + # Map shared Qformer by reference to all modalities. 
+ for modality in self.modalities: + setattr(self, f"{modality}_Qformer", self.shared_Qformer) + setattr(self, f"{modality}_query_tokens", self.shared_query_tokens) + encoder_proj_type=locals()[f"encoder_projection_type_{modality}"] + setattr(self, f"encoder_projection_type_{modality}", locals()[f"encoder_projection_type_{modality}"]) + modality_encoder_features = getattr(self, f"{modality}_encoder").num_features + setattr(self, f"{modality}_encoder_projection", self.init_encoder_projection(modality_encoder_features, shared_qformer_num_features, pretrained_shared_qformer, encoder_proj_type)) + else: + for modality in self.modalities: + if getattr(self,f"cached_{modality}"): + modality_num_features = locals()[f"num_features_{modality}"] + else: + modality_num_features = getattr(self, f"{modality}_encoder").num_features + + setattr(self, f"pretrained_{modality}_qformer", locals()[f"pretrained_{modality}_qformer"]) + setattr(self, f"load_qformer_type_{modality}", locals()[f"load_qformer_type_{modality}"]) + + + setattr(self, f"projection_only_{modality}", locals()[f"projection_only_{modality}"]) + setattr(self, f"projection_path_{modality}", locals()[f"projection_path_{modality}"]) + + if self.projection_only or locals()[f"projection_only_{modality}"]: + logging.info(f"Initializing {modality} projection") + setattr(self, f"pretrained_{modality}_qformer", False) + if modality == 'audio' and proj_dim == 1: + modality_num_features *= 256 # hack to get full beats embedding. define better. 
+ modality_projection = self.init_vicuna_projection( + modality_num_features, + num_query_token*proj_dim, + load_projection_path=getattr(self, f"projection_path_{modality}"), + load_projection_type=modality, + projection_key=f"{modality}_projection" + ) + setattr(self, f"{modality}_projection", modality_projection) + else: + logging.info(f"Initializing {modality} QFormer and query tokens of length {num_query_token}") + modality_qformer, modality_query_tokens = self.init_Qformer( + num_query_token, + modality_num_features, + pretrained_qformer=locals()[f"pretrained_{modality}_qformer"], + load_attention=locals()[f"load_attention_{modality}_qformer"], + load_qformer_type=locals()[f"load_qformer_type_{modality}"] + ) + + if not qformer_text_input: + modality_qformer.bert.embeddings.word_embeddings = None + modality_qformer.bert.embeddings.position_embeddings = None + for layer in modality_qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + else: + modality_qformer.resize_token_embeddings(len(self.tokenizer)) + modality_qformer.cls = None + setattr(self, f"{modality}_Qformer", modality_qformer) + setattr(self, f"{modality}_query_tokens", modality_query_tokens) + + ### Set up LLM ### + logging.info(f"Setting up llm model {llm_model}") + self.lora = lora + print(f"Lora is set to {self.lora}") + self.llm_tokenizer = LlamaTokenizer.from_pretrained(llm_model, use_fast=False, truncation_side="left") + self.llm_tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + self.llm_tokenizer.add_special_tokens({'bos_token': ''}) + self.llm_tokenizer.add_special_tokens({'eos_token': ''}) + self.llm_tokenizer.add_special_tokens({'unk_token': ''}) + if self.lora: + # https://github.com/lxe/llama-peft-tuner/blob/main/finetune_peft.py + self.llm_model = LlamaForCausalLM.from_pretrained( + llm_model, + load_in_8bit=True, + torch_dtype=torch.float16 + ) + self.llm_model.resize_token_embeddings(len(self.llm_tokenizer)) + self.peft_config = LoraConfig( + 
task_type=TaskType.CAUSAL_LM, + r=8, + lora_alpha=32, lora_dropout=0.1, + target_modules=['q_proj', 'v_proj'] + ) + self.llm_model.gradient_checkpointing_enable() + self.llm_model.enable_input_require_grads() + self.llm_model.lm_head = CastOutputToFloat(self.llm_model.lm_head) + self.llm_model.config.use_cache = False # silence the warnings. Please re-enable for inference! + self.llm_hidden_size = self.llm_model.config.hidden_size + self.llm_model = get_peft_model(self.llm_model, self.peft_config) + self.lora_model = lora_model + + else: + self.llm_model = LlamaForCausalLM.from_pretrained( + llm_model, torch_dtype=torch.float16 + ) + self.llm_model.resize_token_embeddings(len(self.llm_tokenizer)) + self.llm_hidden_size = self.llm_model.config.hidden_size + + for name, param in self.llm_model.named_parameters(): + param.requires_grad = False + + # Load LM projections + if self.shared_qformer and load_projection_shared: + qformer = getattr(self, f"shared_Qformer") + load_projection_path = locals()[f"load_projection_shared"] + if load_projection_path: + load_projection_path = locals()[f"pretrained_shared_qformer"] + load_projection_type = locals()[f"load_projection_type_shared"] + setattr(self, f"load_projection_shared", load_projection_path) + setattr(self, f"load_projection_type_shared", locals()[f"load_projection_type_shared"]) + logging.info(f"Loading shared Qformer projection.") + proj = self.init_vicuna_projection( + qformer.config.hidden_size, + self.llm_hidden_size, + load_projection_path=load_projection_path + ) + # Map projection by reference to all modalities. 
+ for modality in self.modalities: + setattr(self, f"{modality}_llm_proj", proj) + else: + for modality in self.modalities: + load_projection_path = locals()[f"load_projection_{modality}"] + if load_projection_path == True: + load_projection_path = locals()[f"pretrained_{modality}_qformer"] + load_projection_type = locals()[f"load_projection_type_{modality}"] + setattr(self, f"load_projection_{modality}", load_projection_path) + setattr(self, f"load_projection_type_{modality}", load_projection_type) + if self.projection_only or getattr(self, f"projection_only_{modality}"): + proj = self.init_vicuna_projection( + self.num_query_token if proj_dim==1 else proj_dim, + self.num_query_token*self.llm_hidden_size if proj_dim==1 else self.llm_hidden_size, + load_projection_path=getattr(self, f"projection_path_{modality}"), + load_projection_type=load_projection_type, + ) + else: + qformer = getattr(self, f"{modality}_Qformer") + proj = self.init_vicuna_projection( + qformer.config.hidden_size, + self.llm_hidden_size, + load_projection_path=load_projection_path, + load_projection_type=load_projection_type + ) + setattr(self, f"{modality}_llm_proj", proj) + + self.clean_tokenization = clean_tokenization + logging.info(f"Clean tokenization is set to {self.clean_tokenization}") + + self.max_txt_len = max_txt_len + self.max_output_txt_len = max_output_txt_len + self.prompt = prompt + + self.prefix = prefix + if self.prefix: + self.tokenized_prefix = self.llm_tokenizer(self.prefix, return_tensors="pt") + + self.postfix = postfix + if type(self.postfix) != str or not self.postfix: + self.postfix = "" + logging.info(f"Using prefix set to {self.prefix} and postfix set to {self.postfix}.") + + self.use_cues = use_cues + logging.info(f"Using cues set to {self.use_cues}.") + if self.use_cues: + logging.info(f"Modality to cue {Blip2VicunaXInstruct.MODALITY_TO_CUE}") + self.tokenized_cue = {} + self.emb_cue = {} + self.att_cue = {} + for modality in self.modalities: + if 
def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts):
    """Splice each output sequence in right after its prompt's attended tokens.

    For every row, the number of attended prompt positions is taken as
    ``input_atts[row].sum()``. The merged row is built as
    ``prompt[:n] + output[1:] + prompt[n:]``: the first output position
    is dropped, and whatever followed the attended prompt prefix is moved
    to the end. Ids and attention masks are merged the same way.

    Args:
        input_ids: 2-D tensor of prompt token ids.
        input_atts: Attention mask matching ``input_ids``.
        output_ids: 2-D tensor of output token ids.
        output_atts: Attention mask matching ``output_ids``.

    Returns:
        A tuple ``(llm_tokens, input_part_targets_len)``: ``llm_tokens``
        is a dict with stacked ``"input_ids"`` and ``"attention_mask"``
        tensors, and ``input_part_targets_len`` is a list with the
        per-row attended prompt length (as returned by ``.sum()``).
    """

    def splice(prompt_row, output_row, cut):
        # Prompt prefix, then the output minus its first position, then
        # the remainder of the prompt row.
        return torch.cat([prompt_row[:cut], output_row[1:], prompt_row[cut:]])

    prompt_lengths = []
    merged_ids = []
    merged_atts = []
    for row in range(input_ids.size(0)):
        n_attended = input_atts[row].sum()
        prompt_lengths.append(n_attended)
        merged_ids.append(splice(input_ids[row], output_ids[row], n_attended))
        merged_atts.append(splice(input_atts[row], output_atts[row], n_attended))

    merged = {
        "input_ids": torch.stack(merged_ids),
        "attention_mask": torch.stack(merged_atts),
    }
    return merged, prompt_lengths
+ + curr_modalities = [modality for modality in self.modalities if modality in samples] + excess_modalities = [modality for modality in self.modalities if modality not in curr_modalities] + # disable gradient in excess modalities + dummy_loss = 0. + for modality in excess_modalities: + if self.shared_qformer: + for name, param in getattr(self, f"{modality}_encoder_projection").named_parameters(): + # param.requires_grad = False + dummy_loss += param.sum()*0. + for name, param in getattr(self,f"{modality}_ln").named_parameters(): + # param.requires_grad = False + dummy_loss += param.sum()*0. + dummy_loss += getattr(self, f"{modality}_query_tokens").sum()*0. + for name, param in getattr(self, f'{modality}_Qformer').named_parameters(): + # param.requires_grad = False + dummy_loss += param.sum()*0. + for name, param in getattr(self, f'{modality}_llm_proj').named_parameters(): + # param.requires_grad = False + dummy_loss += param.sum()*0. + + embeds = {} + query_tokens = {} + data_atts = {} + for modality in curr_modalities: + data = samples[modality] + ln = getattr(self, f"{modality}_ln") + encoder = getattr(self, f"{modality}_encoder") + if modality == "video" and self.video_enc_name in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(2)): + this_frame = data[:,:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][-1] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + # B, Token Size, LM EMB + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(data.size(0), -1, -1) + + elif modality == 'audio' and self.audio_enc_name in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + embeds[modality] = [] + 
data_atts[modality] = [] + for j in range(data.size(1)): + this_frame = data[:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][j] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + # B, Token Size, LM EMB + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(data.size(0), -1, -1) + else: + with self.maybe_autocast(): + embeds[modality] = ln(encoder(data)) + if len(embeds[modality].size()) == 2: + # B, C, D + embeds[modality] = embeds[modality].unsqueeze(1) + # B, C + if self.shared_qformer: + embeds[modality] = getattr(self, f"{modality}_encoder_projection")(embeds[modality]) + data_atts[modality] = torch.ones(embeds[modality].size()[:-1], dtype=torch.long).to(self.device) + + # B, Token Size, LM EMB + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(embeds[modality].shape[0], -1, -1) + + query_outputs = {} + if self.qformer_text_input: + text_Qformer = self.tokenizer( + samples["text_input"] if not self.special_qformer_input_prompt else self.special_qformer_input_prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + + Qformer_atts = {} + query_atts = {} + + for modality in curr_modalities: + # B, Token Size + query_atts[modality] = torch.ones(query_tokens[modality].size()[:-1], dtype=torch.long).to(self.device) + # B, Token Size + Inp Size + Qformer_atts[modality] = torch.cat([query_atts[modality],text_Qformer.attention_mask],dim=1) + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in 
Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num)]]*bs) for j_ in j] + reordered_embeds = torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num, self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids.repeat(num, 1), + attention_mask=Qformer_atts[modality].repeat(num, 1), + query_embeds=query_tokens[modality].repeat(num, 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts[modality], + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), + encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + else: + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num = len(embeds[modality]) + bs = 
embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num)]]*bs) for j_ in j] + reordered_embeds = torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num, self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality].repeat(num, 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + bs = embeds[modality].shape[0] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), # pc data is floa16. 
+ encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + + inputs_llm = {} + atts_llm = {} + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + # num*bs, num query tokens, llm emb size + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].unsqueeze(1)).reshape(bs*num, self.num_query_token, -1) + else: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality]).reshape(bs*num, self.num_query_token, -1) + inputs_llm[modality] = inputs_llm[modality].reshape(bs, num, self.num_query_token, -1).view(bs, num*self.num_query_token, -1) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + continue + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].last_hidden_state[:,:query_tokens[modality].size(1),:]) + # bs, num, num query tokens, llm emb size -> bs, num*num query tokens, llm emb size + inputs_llm[modality] = inputs_llm[modality].reshape(bs, num, self.num_query_token, -1).view(bs, num*self.num_query_token, -1) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim == 1: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].mean(-1)).reshape(bs, self.num_query_token, -1) + else: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].reshape(bs, self.num_query_token, -1)) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + continue + inputs_llm[modality] = getattr(self, 
f"{modality}_llm_proj")(query_outputs[modality].last_hidden_state[:,:query_tokens[modality].size(1),:]) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + + self.llm_tokenizer.padding_side = "right" + self.llm_tokenizer.truncation_side = 'left' + + if self.llm_text_input: + text_input_tokens = self.llm_tokenizer( + [f"{t}{self.postfix}" for t in samples['text_input']] if self.postfix else samples['text_input'], + return_tensors="pt", + padding="longest", + truncation=True, + max_length=self.max_txt_len, + add_special_tokens= not self.clean_tokenization + ).to(self.device) + + self.llm_tokenizer.truncation_side = 'right' + text_output_tokens = self.llm_tokenizer( + [t + self.llm_tokenizer.eos_token for t in samples['text_output']], + return_tensors="pt", + padding="longest", + truncation=True, + max_length=self.max_output_txt_len, + ).to(self.device) + + if self.llm_text_input: + llm_tokens, input_part_targets_len = self.concat_text_input_output( + text_input_tokens.input_ids, + text_input_tokens.attention_mask, + text_output_tokens.input_ids, + text_output_tokens.attention_mask, + ) + else: + llm_tokens = text_output_tokens + input_part_targets_len = [0 for _ in range(llm_tokens['input_ids'].shape[0])] # input length is 0 + + + # do not apply loss to the padding + targets = llm_tokens['input_ids'].masked_fill( + llm_tokens['input_ids'] == self.llm_tokenizer.pad_token_id, -100 + ) + + # do not apply loss to the text input (i.e., instruction) + for i, l in enumerate(input_part_targets_len): + targets[i][:l] = -100 + + inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens['input_ids']) + + bs = inputs_embeds.shape[0] + + att_list = [] + inp_list = [] + + if self.prefix: + att_list = [self.tokenized_prefix.attention_mask.repeat(bs, 1).to(self.device)] + inp_list = [self.llm_model.get_input_embeddings()(self.tokenized_prefix.input_ids.to(self.device)).repeat(bs, 1, 1)] + for modality in curr_modalities: + 
def init_image_encoder(self, model_name, precision, **kwargs):
    """Create the image encoder and its LayerNorm.

    The encoder comes from the parent class's ``init_vision_encoder``;
    the LayerNorm comes from ``self.init_ln`` and is sized by the
    encoder's ``num_features``.

    Args:
        model_name: Encoder name forwarded to ``init_vision_encoder``.
        precision: Precision forwarded to ``init_vision_encoder``.
        **kwargs: Must contain ``load_ln_path``, ``load_ln_type``,
            ``image_size``, ``drop_path_rate`` and ``use_grad_checkpoint``.

    Returns:
        A tuple ``(encoder, ln)``.

    Raises:
        KeyError: If a required keyword is missing.
    """
    ln_checkpoint = kwargs.pop('load_ln_path')
    ln_kind = kwargs.pop('load_ln_type')

    image_size = kwargs['image_size']
    drop_path_rate = kwargs['drop_path_rate']
    use_grad_checkpoint = kwargs['use_grad_checkpoint']

    # The second value returned by the parent initializer is unused.
    vision_encoder, _ = super().init_vision_encoder(
        model_name,
        image_size,
        drop_path_rate,
        use_grad_checkpoint,
        precision,
    )
    layer_norm = self.init_ln(
        vision_encoder.num_features,
        load_ln_path=ln_checkpoint,
        load_ln_type=ln_kind,
    )
    return vision_encoder, layer_norm
load_ln_path = kwargs['load_ln_path'] + del kwargs['load_ln_path'] + load_ln_type=kwargs['load_ln_type'] + del kwargs['load_ln_type'] + + if model_name == "ulip2_pointbert": + pc_encoder = ULIP_PointBERT(ulip_v=2) + elif model_name == "ulip_shapenet": + pc_encoder = ULIP_PointBERT(ulip_v="shapenet") + elif model_name == "ulip_objaverse": + pc_encoder = ULIP_PointBERT(ulip_v="objaverse_k_1") + elif model_name == "objaverse_shapenet_k_1": + pc_encoder = ULIP_PointBERT(ulip_v="objaverse_shapenet_k_1") + elif model_name == "ulip2_scaledup": + pc_encoder = ULIP_PointBERT(ulip_v="ulip2_scaledup") + else: + pc_encoder = ULIP_PointBERT(ulip_v=1) + ln_pc = self.init_ln(pc_encoder.num_features, load_ln_path=load_ln_path, load_ln_type=load_ln_type) + self.pc_enc_name = model_name + return pc_encoder, ln_pc + + + def init_video_encoder( + self, model_name, precision, **kwargs + ): + assert model_name in [ + "eva_clip_g", + "eva2_clip_L", + "clip_L", + ], "video_model must be in [eva_clip_g, eva2_clip_L, clip_L]" + + if model_name in ["eva_clip_g","eva2_clip_L","clip_L",]: + video_encoder, ln_video = self.init_image_encoder( + model_name, precision=precision, **kwargs + ) + self.video_enc_name = model_name + return video_encoder, ln_video + + def init_audio_encoder( + self, model_name, precision, **kwargs + ): + assert model_name in [ + 'beats' + ], "audio model must be in [beats]" + + load_ln_path = kwargs['load_ln_path'] + del kwargs['load_ln_path'] + load_ln_type=kwargs['load_ln_type'] + del kwargs['load_ln_type'] + if "beats" in model_name: + from lavis.models.beats_encoder import BeatsEncoder + if self.cached_audio: + audio_encoder = lambda x: x + ln_audio = self.init_ln(768, load_ln_path=load_ln_path, load_ln_type=load_ln_type) + else: + audio_encoder = BeatsEncoder(**kwargs) + if not self.cached_audio: + ln_audio = self.init_ln(audio_encoder.num_features, load_ln_path=load_ln_path, load_ln_type=load_ln_type) + self.audio_enc_name = model_name + return audio_encoder, 
ln_audio + + @torch.no_grad() + def get_query_outputs( + self, + samples + ): + if samples == None or samples == {}: + return + + curr_modalities = [modality for modality in self.modalities if modality in samples] + if len(curr_modalities) == 0: + print("Model modalities do not match sample modalities.") + return + + # get batch size + bs = None + for modality in curr_modalities: + data = samples[modality] + bs = data.size(0) + break + + if "prompt" in samples.keys(): + prompt = samples["prompt"] + elif "text_input" in samples.keys(): + prompt = samples["text_input"] + else: + prompt = self.prompt + + if isinstance(prompt, str): + prompt = [prompt] * bs + else: + assert len(prompt) == bs, "The number of prompts must be equal to the batch size." + + embeds = {} + query_tokens = {} + data_atts = {} + + for modality in curr_modalities: + data = samples[modality] + ln = getattr(self, f"{modality}_ln") + encoder = getattr(self, f"{modality}_encoder") + if modality == "video" and self.video_enc_name in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(2)): + this_frame = data[:,:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][-1] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + # B, Token Size, LM EMB + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(data.size(0), -1, -1) + + elif modality == 'audio' and self.audio_enc_name in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(1)): + this_frame = data[:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][j] = getattr(self, 
f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + # B, Token Size, LM EMB + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(data.size(0), -1, -1) + else: + with self.maybe_autocast(): + embeds[modality] = ln(encoder(data)) + if len(embeds[modality].size()) == 2: + # B, C, D + embeds[modality] = embeds[modality].unsqueeze(1) + # B, C + if self.shared_qformer: + embeds[modality] = getattr(self, f"{modality}_encoder_projection")(embeds[modality]) + + data_atts[modality] = torch.ones(embeds[modality].size()[:-1], dtype=torch.long).to(self.device) + + # B, Token Size, LM EMB + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(embeds[modality].shape[0], -1, -1) + + query_outputs = {} + if self.qformer_text_input: + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + + Qformer_atts = {} + query_atts = {} + num = {} + for modality in curr_modalities: + # B, Token Size + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_atts[modality] = torch.ones(query_tokens[modality].size()[:-1], dtype=torch.long).to(self.device) + # B, Token Size + Inp Size + Qformer_atts[modality] = torch.cat([query_atts[modality],text_Qformer.attention_mask],dim=1) + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num[modality] = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num[modality])]]*bs) for j_ in j] + reordered_embeds = 
torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num[modality], self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids.repeat(num[modality], 1), + attention_mask=Qformer_atts[modality].repeat(num[modality], 1), + query_embeds=query_tokens[modality].repeat(num[modality], 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts[modality], + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), + encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + else: + num = {} + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num[modality] = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num[modality])]]*bs) for j_ in j] + reordered_embeds = 
torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num, self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality].repeat(num[modality], 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), # pc data is floa16. 
+ encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[f'llm_proj_{modality}'] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].unsqueeze(1)).reshape(bs*num, self.num_query_token, -1) + else: + query_outputs[f'llm_proj_{modality}'] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality]).reshape(bs*num, self.num_query_token, -1) + query_outputs[f'llm_proj_{modality}'] = query_outputs[f'llm_proj_{modality}'].reshape(bs, num[modality], self.num_query_token, -1).contiguous().view(bs, num[modality]*self.num_query_token, -1) + query_outputs[modality] = query_outputs[modality].view(bs, num[modality]*self.num_query_token, -1) + else: + query_outputs[f'llm_proj_{modality}'] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality]['last_hidden_state'][:,:query_tokens[modality].size(1),:]).contiguous().view(bs, num[modality]*self.num_query_token, -1) + query_outputs[modality] = query_outputs[modality]['last_hidden_state'][:,:query_tokens[modality].size(1),:].contiguous().view(bs, num[modality]*self.num_query_token, -1) + + + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim == 1: + query_outputs[f'llm_proj_{modality}'] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].mean(-1)).reshape(bs, self.num_query_token, -1) + else: + query_outputs[f'llm_proj_{modality}']= getattr(self, f"{modality}_llm_proj")(query_outputs[modality].reshape(bs, self.num_query_token, -1)) + else: + query_outputs[modality] = query_outputs[modality].last_hidden_state[:,:query_tokens[modality].size(1),:] + query_outputs[f'llm_proj_{modality}'] = getattr(self, 
f"{modality}_llm_proj")(query_outputs[modality]) + + for modality in curr_modalities: + query_outputs[f'embeds_{modality}'] = embeds[modality] + return query_outputs + + @torch.no_grad() + def generate( + self, + samples, + use_nucleus_sampling=False, + num_beams=5, + max_length=256, + min_length=1, + top_p=0.9, + repetition_penalty=1.5, + length_penalty=1, + num_captions=1, + temperature=1, + special_qformer_input_prompt=False + ): + self.llm_tokenizer.padding_side = "left" + + if samples == None or samples == {}: + return + + if 'modalities' in samples: + curr_modalities = samples['modalities'][0] if isinstance(samples['modalities'][0], list) else samples['modalities'] + elif self.joint_video_audio: + curr_modalities = ["video", "audio"] + else: + curr_modalities = [modality for modality in self.modalities if modality in samples] + + + if len(curr_modalities) == 0: + print("Model modalities do not match sample modalities.") + return + + # get batch size + bs = None + for modality in curr_modalities: + data = samples[modality] + if isinstance(data, torch.Tensor): + bs = data.size(0) + else: + bs = len(data) + break + + if "prompt" in samples.keys(): + prompt = samples["prompt"] + elif self.prompt and 'text_input' in samples and '{}' in self.prompt: + prompt = [self.prompt.format(t) for t in samples["text_input"]] + elif "text_input" in samples.keys(): + prompt = samples["text_input"] + else: + prompt = self.prompt + + if isinstance(prompt, str): + prompt = [prompt] * bs + else: + assert len(prompt) == bs, "The number of prompts must be equal to the batch size." 
+ + # For TextCaps + if "ocr_tokens" in samples.keys() and "{}" in prompt[0]: + prompt = [p.format(', '.join(samples['ocr_tokens'][i][:30])) for i, p in enumerate(prompt)] + + + if 'discrn' in samples and self.use_caption: ## discriminatory reasoning + if self.postfix: + prompt = [f'{t}{self.postfix}' for t in prompt] + if self.enumerate_inputs: + prompt = [f'{self.prefix}(a){Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][0]] if self.use_cues else " "}{samples["baseline_captions"][i][0]} (b){Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][1]] if self.use_cues else " "}{samples["baseline_captions"][i][1]} {prompt[i]}' for i in range(bs)] + else: + prompt = [f'{self.prefix}{Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][0]]}{samples["baseline_captions"][i][0] if self.use_cues else " "}{Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][1]] if self.use_cues else " "}{samples["baseline_captions"][i][1]} {prompt[i]}' for i in range(bs)] + llm_tokens = self.llm_tokenizer( + prompt, + padding="longest", + return_tensors="pt" + ).to(self.device) + inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids) + + with self.maybe_autocast(): + outputs = self.llm_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=llm_tokens.attention_mask, + do_sample=use_nucleus_sampling, + top_p=top_p, + temperature=temperature, + num_beams=num_beams, + max_length=max_length, + min_length=min_length, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + num_return_sequences=num_captions, + ) + + outputs[outputs == 0] = 2 # convert output id 0 to 2 (eos_token_id) + output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True) + output_text = [o.strip() for o in output_text] + # print(output) + return output_text + + query_tokens = {} + for modality in curr_modalities: + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + 
query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(bs, -1, -1) + if self.qformer_text_input: + if self.special_qformer_input_prompt or special_qformer_input_prompt: + qformer_prompt = special_qformer_input_prompt if special_qformer_input_prompt else self.special_qformer_input_prompt + qformer_prompt = [qformer_prompt] * len(prompt) + if "text_input" in samples.keys(): + if type(samples["text_input"][0]) == list: + qformer_prompt = [qformer_prompt[i].format(*samples["text_input"][i]) for i in range(len(qformer_prompt))] + else: + qformer_prompt = [qformer_prompt[i].format(samples["text_input"][i]) for i in range(len(qformer_prompt))] + text_Qformer = self.tokenizer( + qformer_prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + elif self.use_describe: + modality2prompt = { + "video": "a short description of the video", + "audio": "an audio that shows", + "image": "a short image caption", + "pc": "a 3d model of" + } + qformer_prompt = [modality2prompt[modality] for _ in samples['text_input']] + + text_Qformer = self.tokenizer( + qformer_prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + else: + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + Qformer_atts = {} + query_atts = {} + + for modality in curr_modalities: + if not getattr(self, f"projection_only_{modality}"): + # B, Token Size + query_atts[modality] = torch.ones(query_tokens[modality].size()[:-1], dtype=torch.long).to(self.device) + # B, Token Size + Inp Size + Qformer_atts[modality] = torch.cat([query_atts[modality],text_Qformer.attention_mask],dim=1) + + embeds = {} + data_atts = {} + for modality in curr_modalities: + data = samples[modality] + ln = getattr(self, f"{modality}_ln") + encoder = getattr(self, f"{modality}_encoder") 
+ if modality == "video" and "clip" in self.video_enc_name: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(2)): + this_frame = data[:,:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][j] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + elif modality == 'audio' and 'beats' in self.audio_enc_name: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(1)): + this_frame = data[:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][j] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + else: + with self.maybe_autocast(): + embeds[modality] = ln(encoder(data)) + if len(embeds[modality].size()) == 2: + embeds[modality] = embeds[modality].unsqueeze(1) + if self.shared_qformer: + with self.maybe_autocast(): + embeds[modality] = getattr(self, f"{modality}_encoder_projection")(embeds[modality]) + data_atts[modality] = torch.ones(embeds[modality].size()[:-1], dtype=torch.long).to(self.device) + + query_outputs = {} + num = {} + if self.qformer_text_input: + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num[modality] = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num[modality])]]*bs) for j_ in j] + reordered_embeds = torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if 
self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num[modality], self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids.repeat(num[modality], 1), + attention_mask=Qformer_atts[modality].repeat(num[modality], 1), + query_embeds=query_tokens[modality].repeat(num[modality], 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + bs = embeds[modality].shape[0] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts[modality], + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), + encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + else: + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num[modality] = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num[modality])]]*bs) for j_ in j] + reordered_embeds = torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + 
if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num[modality], self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality].repeat(num[modality], 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + bs = embeds[modality].shape[0] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + with self.maybe_autocast(): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), + encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + + inputs_llm = {} + atts_llm = {} + enumeration = {} + + for i,modality in enumerate(curr_modalities): + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].unsqueeze(1)).reshape(bs*num[modality], self.num_query_token, -1) + else: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].reshape(bs*num, self.num_query_token, -1)) + inputs_llm[modality] = inputs_llm[modality].reshape(bs, 
num[modality], self.num_query_token, -1).view(bs, num[modality]*self.num_query_token, -1) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + continue + # num*bs, num query tokens, llm emb size + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].last_hidden_state[:,:query_tokens[modality].size(1),:]) + # bs, num, num query tokens, llm emb size -> bs, num*num query tokens, llm emb size + inputs_llm[modality] = inputs_llm[modality].reshape(bs, num[modality], self.num_query_token, -1).view(bs, num[modality]*self.num_query_token, -1) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim == 1: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].mean(-1)).reshape(bs, self.num_query_token, -1) + else: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].reshape(bs, self.num_query_token, -1)) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + continue + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality]['last_hidden_state'][:,:query_tokens[modality].size(1),:]) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + if self.enumerate_inputs: + enumeration[modality] = self.llm_tokenizer( + [f"{'' if i == 0 else ' '}({chr(97+i)}) " for _ in prompt], + return_tensors="pt", + add_special_tokens=False if (i!= 0 or self.prefix) else True + ).to(self.device) + + ## remove trailing whitespace + prompt = [p.strip() for p in prompt] + + if 'dialog' in samples: + llm_tokens = self.llm_tokenizer( + [f"{d} {p}" if d else p for d, p in zip(samples['dialog'], prompt)], + padding="longest", + return_tensors="pt", + add_special_tokens= not 
self.clean_tokenization + ).to(self.device) + else: + llm_tokens = self.llm_tokenizer( + [f"{p}{self.postfix}" for p in prompt] if self.postfix else prompt, + padding="longest", + return_tensors="pt", + add_special_tokens= not self.clean_tokenization + ).to(self.device) + bs = llm_tokens.input_ids.shape[0] + + att_list = [] + inp_list = [] + if self.prefix: + att_list = [self.tokenized_prefix.attention_mask.repeat(bs, 1).to(self.device)] + inp_list = [self.llm_model.get_input_embeddings()(self.tokenized_prefix.input_ids.to(self.device)).repeat(bs, 1, 1)] + + if self.joint_video_audio: + for pos in range(num['video']): + if self.enumerate_inputs: + enumeration_pos = self.llm_tokenizer( + [f"{'' if pos == 0 else ' '}({chr(97+pos)}) " for _ in prompt], + return_tensors="pt", + add_special_tokens=False if (pos!= 0 or self.prefix) else True + ).to(self.device) + enumeration_inputs_llm = self.llm_model.get_input_embeddings()(enumeration_pos.input_ids) + enumeration_atts_llm = enumeration_pos.attention_mask.to(self.device) + inp_list.extend([enumeration_inputs_llm]) + att_list.extend([enumeration_atts_llm]) + if self.use_cues: + for modality in ['video', 'audio']: + if self.clean_tokenization: + if self.prefix or pos > 1 or self.enumerate_inputs or modality == 'audio': + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask[:,1:]).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality].view(bs, num[modality], self.num_query_token)[:, pos, :]]) + inp_list.extend([self.emb_cue[modality][:,1:].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality].view(bs, num[modality], self.num_query_token, -1)[:, pos, :, :]]) + continue + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality].view(bs, num[modality], self.num_query_token)[:, pos, :]]) + 
inp_list.extend([self.emb_cue[modality].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality].view(bs, num[modality], self.num_query_token, -1)[:, pos, :, :]]) + else: + att_list.extend([atts_llm[modality].view(bs, num[modality], self.num_query_token)[:, pos, :]]) + inp_list.extend([inputs_llm[modality].view(bs, num[modality], self.num_query_token, -1)[:, pos, :, :]]) + else: + for modality in curr_modalities: + if self.enumerate_inputs: + enumeration_inputs_llm = self.llm_model.get_input_embeddings()(enumeration[modality].input_ids.to(self.device)) + enumeration_atts_llm = enumeration[modality].attention_mask.to(self.device) + inp_list.extend([enumeration_inputs_llm]) + att_list.extend([enumeration_atts_llm]) + if self.use_cues: + if self.clean_tokenization or self.remove_start: + if (modality==curr_modalities[0] and not (self.prefix or self.enumerate_inputs)): + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality]]) + inp_list.extend([self.emb_cue[modality].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality]]) + else: + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask[:,1:]).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality]]) + inp_list.extend([self.emb_cue[modality][:,1:].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality]]) + else: + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality]]) + inp_list.extend([self.emb_cue[modality].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality]]) + + else: + att_list.extend([atts_llm[modality]]) + inp_list.extend([inputs_llm[modality]]) + + if self.add_space: + space_tok = self.llm_tokenizer( + [f" " for _ in prompt], + return_tensors="pt", + add_special_tokens=False + ) 
+ space_inputs_llm = self.llm_model.get_input_embeddings()(space_tok.input_ids.to(self.device)) + space_atts_llm = space_tok.attention_mask.to(self.device) + inp_list.extend([space_inputs_llm]) + att_list.extend([space_atts_llm]) + + att_list.append(llm_tokens.attention_mask) + inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids) + inp_list.append(inputs_embeds) + + attention_mask = torch.cat(att_list, dim=1) + inputs_embeds = torch.cat(inp_list, dim=1) + + + with self.maybe_autocast(): + outputs = self.llm_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + do_sample=use_nucleus_sampling, + top_p=top_p, + temperature=temperature, + num_beams=num_beams, + max_length=max_length, + min_length=min_length, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + num_return_sequences=num_captions, + ) + outputs[outputs == 0] = 2 # convert output id 0 to 2 (eos_token_id) + output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True) + output_text = [o.strip() for o in output_text] + return output_text + + @torch.no_grad() + def predict_answers( + self, + samples, + num_beams=5, + inference_method="generate", + max_len=10, + min_len=1, + num_ans_candidates=128, + answer_list=None, + prompt="", + length_penalty=-1, + **kwargs + ): + if samples == None or samples == {}: + return None + + # get batch size + bs = None + if 'modalities' in samples: + curr_modalities = samples['modalities'][0] if isinstance(samples['modalities'][0], list) else samples['modalities'] + else: + curr_modalities = [modality for modality in self.modalities if modality in samples] + for modality in curr_modalities: + data = samples[modality] + if isinstance(data, torch.Tensor): + bs = data.size(0) + else: + bs = len(data) + break + + if "text_input" not in samples: + samples["text_input"] = self.prompt + if isinstance(samples["text_input"], str): + samples["text_input"] = [samples["text_input"]] * bs + 
text_input = samples['text_input'] + + if not prompt and self.prompt: + prompt=self.prompt + if prompt: + if prompt.count("{}") == 2: + if 'ocr_tokens' in samples: + text_input = [ + prompt.format(', '.join(samples['ocr_tokens'][i][:30]), samples["text_input"][i]) + for i in range(len(samples["text_input"]))] + elif 'choices' in samples: + text_input = [] + for i in range(len(samples["text_input"])): + this_choices = [f"({string.ascii_lowercase[j]}) {ch}" for j, ch in enumerate(samples["choices"][i])] + this_choices = " ".join(this_choices) + text_input.append(prompt.format(samples["text_input"][i], this_choices)) + else: + text_input = [prompt.format(question) for question in samples["text_input"]] + samples["prompt"] = text_input + + if 'discrn' in samples and self.use_caption: ## discriminatory reasoning + self.llm_tokenizer.padding_side = "left" + + text_input = samples['text_input'] if 'prompt' not in samples else samples['prompt'] + if self.postfix: + text_input = [f'{t}{self.postfix}' for t in text_input] + if self.enumerate_inputs: + prompt = [f'{self.prefix}(a){Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][0]] if self.use_cues else " "}{samples["baseline_captions"][i][0]} (b){Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][1]] if self.use_cues else " "}{samples["baseline_captions"][i][1]} {text_input[i]}' for i in range(bs)] + else: + prompt = [f'{self.prefix}{Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][0]]}{samples["baseline_captions"][i][0] if self.use_cues else " "}{Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][1]] if self.use_cues else " "}{samples["baseline_captions"][i][1]} {text_input[i]}' for i in range(bs)] + llm_tokens = self.llm_tokenizer( + prompt, + padding="longest", + return_tensors="pt" + ).to(self.device) + + with self.maybe_autocast(): + outputs = self.llm_model.generate( + inputs_embeds=self.llm_model.get_input_embeddings()(llm_tokens.input_ids), + 
attention_mask=llm_tokens.attention_mask, + do_sample=False, + num_beams=num_beams, + max_length=max_len, + min_length=min_len, + repetition_penalty=1.5, + # eos_token_id=self.eos_token_id, + length_penalty=length_penalty, + ) + outputs[outputs == 0] = 2 # convert output id 0 to 2 (eos_token_id) + output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True) + return output_text + + output_text = self.generate( + samples, + num_beams=num_beams, + max_length=max_len, + min_length=min_len, + length_penalty=length_penalty + ) + + if "apply_lemmatizer" in samples.keys() and samples["apply_lemmatizer"]: + output_text = self._lemmatize(output_text) + + #vizwiz + output_text = [o if o != "" else "unanswerable" for o in output_text] + + return output_text + + def predict( + self, + samples, + candidates=None, + n_segments=1, + max_length=10, + min_length=1, + length_penalty=-1., + special_qformer_input_prompt=False + ): + + self.llm_tokenizer.padding_side = "left" + + if candidates == None: + candidates = self.candidates + else: + self.candidates = candidates # for the output targets. 
+ + if self.predict_with_gen: + output = self.generate(samples,max_length=max_length,min_length=min_length,length_penalty=length_penalty) + result = [] + for text in output: + text = BlipCaptionProcessor().pre_caption(text) + pred_label = "" # default to an empty string + for cand in candidates: + cand = BlipCaptionProcessor().pre_caption(cand) + if cand in text.split(" "): + pred_label = cand + break # stop as soon as we find a match + result.append(pred_label) + return {"predictions":result, "target": samples["label"]} + + + # If candidates is a list of lists, each sample has its candidates, then we need to iterate one by one + if type(candidates[0]) == list: + results = [] + + for i in range(samples["image"].size(0)): + this_sample = { + "image": samples["image"][i].unsqueeze(0), + "prompt": samples["prompt"], + } + + if "text_input" in samples.keys(): + this_sample["text_input"] = [samples["text_input"][i]] + + if 'context' in samples.keys(): + this_sample['context'] = [samples["context"][i]] + + if 'history' in samples.keys(): + this_sample['history'] = [samples["history"][i]] + + if 'caption' in samples.keys(): + this_sample['caption'] = [samples["caption"][i]] + + this_result = self._predict_class(this_sample, candidates[i], n_segments, special_qformer_input_prompt) + results.append(this_result) + + try: + results = torch.cat(results, dim=0) + except: + results = [res.tolist()[0] for res in results] + + return results + + return self._predict_class(samples, candidates, n_segments, special_qformer_input_prompt) + + def _predict_class( + self, + samples, + candidates, + n_segments=1, + special_qformer_input_prompt=False, + ): + if list(samples.keys()) == []: + return None + + if "prompt" in samples: + prompt = samples["prompt"] + else: + prompt = self.prompt + + candidates = [self.format_candidates_prompt.format(c) for c in candidates] + + if 'modalities' in samples: + curr_modalities = samples['modalities'][0] if isinstance(samples['modalities'][0], list) 
else samples['modalities'] + else: + curr_modalities = [modality for modality in self.modalities if modality in samples] + + # get batch size + for modality in curr_modalities: + data = samples[modality] + if isinstance(data, torch.Tensor): + bs = data.size(0) + else: + bs = len(data) + break + + if isinstance(prompt, str): + prompt = [prompt] * bs + else: + assert len(prompt) == bs, "The number of prompts must be equal to the batch size." + + if "text_input" in samples.keys(): + if type(samples["text_input"][0]) == list: + prompt = [prompt[i].format(*samples["text_input"][i]) for i in range(len(prompt))] + else: + prompt = [prompt[i].format(samples["text_input"][i]) for i in range(len(prompt))] + + # scienceqa + if 'context' in samples.keys() and samples['context'] != '': + prompt = [f'context: {samples["context"][i]}. {prompt[i]}' for i in range(len(prompt))] + + # visual dialog + if 'history' in samples.keys() and samples['history'][0] != '': + prompt = [f'dialog history: {samples["history"][i]}\n{prompt[i]}' for i in range(len(prompt))] + + if 'caption' in samples.keys() and samples['caption'][0] != '': + prompt = [f'This image has the caption "{samples["caption"][i]}". 
{prompt[i]}' for i in range(len(prompt))] + + + + if 'discrn' in samples and self.use_caption: ## discriminatory reasoning + if self.postfix: + prompt = [f'{p}{self.postfix}' for p in prompt] + if self.enumerate_inputs: + prompt = [f'{self.prefix}(a){Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][0]] if self.use_cues else " "}{samples["baseline_captions"][i][0]} (b){Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][1]] if self.use_cues else " "}{samples["baseline_captions"][i][1]} {prompt[i]}' for i in range(bs)] + else: + prompt = [f'{self.prefix}{Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][0]]}{samples["baseline_captions"][i][0] if self.use_cues else " "}{Blip2VicunaXInstruct.MODALITY_TO_CUE[samples["modalities"][i][1]] if self.use_cues else " "}{samples["baseline_captions"][i][1]} {prompt[i]}' for i in range(bs)] + text_input_tokens = self.llm_tokenizer( + prompt, + padding="longest", + return_tensors="pt" + ).to(self.device) + else: + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + query_tokens = {} + for modality in self.modalities: + if modality not in samples: + continue + query_tokens[modality] = getattr(self, f"{modality}_query_tokens").expand(bs, -1, -1) + + if self.qformer_text_input: + if self.special_qformer_input_prompt or special_qformer_input_prompt: + + qformer_prompt = special_qformer_input_prompt if special_qformer_input_prompt else self.special_qformer_input_prompt + qformer_prompt = [qformer_prompt] * len(prompt) + if "text_input" in samples.keys(): + if type(samples["text_input"][0]) == list: + qformer_prompt = [qformer_prompt[i].format(*samples["text_input"][i]) for i in range(len(qformer_prompt))] + else: + qformer_prompt = [qformer_prompt[i].format(samples["text_input"][i]) for i in range(len(qformer_prompt))] + + text_Qformer = self.tokenizer( + qformer_prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + 
).to(self.device) + elif self.use_describe: + modality2prompt = { + "video": "a short description of the video", + "audio": "an audio that shows", + "image": "a short image caption", + "pc": "a 3d model of" + } + qformer_prompt = [modality2prompt[modality] for _ in samples['text_input']] + + # qformer_prompt = [f'Describe the {Blip2VicunaXInstruct.MODALITY_TO_CUE[modality].replace(":", "").strip() if modality != "pc" else "3d model"}.' for _ in samples["text_input"]] + text_Qformer = self.tokenizer( + qformer_prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + else: + text_Qformer = self.tokenizer( + prompt, + padding='longest', + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + + Qformer_atts = {} + query_atts = {} + + for modality in curr_modalities: + # B, Token Size + query_atts[modality] = torch.ones(query_tokens[modality].size()[:-1], dtype=torch.long).to(self.device) + # B, Token Size + Inp Size + Qformer_atts[modality] = torch.cat([query_atts[modality],text_Qformer.attention_mask],dim=1) + + embeds = {} + data_atts = {} + for modality in curr_modalities: + data = samples[modality] + ln = getattr(self, f"{modality}_ln") + encoder = getattr(self, f"{modality}_encoder") + if modality == "video" and "clip" in self.video_enc_name: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(2)): + this_frame = data[:,:,j,:,:] + with self.maybe_autocast(): + embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][j] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + + elif modality == 'audio' and 'beats' in self.audio_enc_name: + embeds[modality] = [] + data_atts[modality] = [] + for j in range(data.size(1)): + this_frame = data[:,j,:,:] + with self.maybe_autocast(): 
+ embeds[modality].append(ln(encoder(this_frame))) + if self.shared_qformer: + embeds[modality][j] = getattr(self, f"{modality}_encoder_projection")(embeds[modality][j]) + data_atts[modality].append(torch.ones(embeds[modality][j].size()[:-1], dtype=torch.long).to(self.device)) + else: + with self.maybe_autocast(): + embeds[modality] = ln(encoder(data)) + if len(embeds[modality].size()) == 2: + # B, C, D + embeds[modality] = embeds[modality].unsqueeze(1) + # B, C + if self.shared_qformer: + embeds[modality] = getattr(self, f"{modality}_encoder_projection")(embeds[modality]) + data_atts[modality] = torch.ones(embeds[modality].size()[:-1], dtype=torch.long).to(self.device) + + query_outputs = {} + num = {} + if self.qformer_text_input: + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num[modality] = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num[modality])]]*bs) for j_ in j] + reordered_embeds = torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num[modality], self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids.repeat(num[modality], 1), + attention_mask=Qformer_atts[modality].repeat(num[modality], 1), + query_embeds=query_tokens[modality].repeat(num[modality], 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = 
query_output + else: + bs = embeds[modality].shape[0] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + text_Qformer.input_ids, + attention_mask=Qformer_atts[modality], + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), + encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + else: + for modality in curr_modalities: + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + num[modality] = len(embeds[modality]) + bs = embeds[modality][0].shape[0] + indices = [j_+r for r,j in enumerate([[i*bs for i in range(num[modality])]]*bs) for j_ in j] + reordered_embeds = torch.cat(embeds[modality])[indices] + reordered_atts = torch.cat(data_atts[modality])[indices] + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.mean(1,keepdim=True)).view(bs*num[modality], self.num_query_token, -1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(reordered_embeds.view(reordered_embeds.shape[0],-1)) + continue + query_output = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality].repeat(num[modality], 1, 1), + encoder_hidden_states=reordered_embeds, + encoder_attention_mask=reordered_atts, + return_dict=True, + ) + query_outputs[modality] = query_output + else: + bs = embeds[modality].shape[0] + if self.projection_only or getattr(self, 
f"projection_only_{modality}"): + if self.proj_dim != 1: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality].mean(1, keepdim=True)).reshape(bs, self.num_query_token,-1) + else: + query_outputs[modality] = getattr(self, f"{modality}_projection")(embeds[modality]).reshape(bs, self.num_query_token,-1) + continue + query_outputs[modality] = getattr(self, f"{modality}_Qformer").bert( + query_embeds=query_tokens[modality], + encoder_hidden_states=embeds[modality].to(torch.float32), + encoder_attention_mask=data_atts[modality], + return_dict=True, + ) + + inputs_llm = {} + atts_llm = {} + enumeration = {} + # from pdb import set_trace; set_trace() + for i,modality in enumerate(curr_modalities): + if modality in Blip2VicunaXInstruct.SEQUENCIAL_MODALITIES and getattr(self, f'{modality}_enc_name') in Blip2VicunaXInstruct.SEQUENCIAL_ENCODERS: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim != 1: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].unsqueeze(1)).reshape(bs*num[modality], self.num_query_token, -1) + else: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].reshape(bs*num, self.num_query_token, -1)) + inputs_llm[modality] = inputs_llm[modality].reshape(bs, num[modality], self.num_query_token, -1).view(bs, num[modality]*self.num_query_token, -1) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + continue + # num*bs, num query tokens, llm emb size + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].last_hidden_state[:,:query_tokens[modality].size(1),:]) + # bs, num, num query tokens, llm emb size -> bs, num*num query tokens, llm emb size + inputs_llm[modality] = inputs_llm[modality].reshape(bs, num[modality], self.num_query_token, -1).view(bs, num[modality]*self.num_query_token, -1) + atts_llm[modality] = 
torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + + else: + if self.projection_only or getattr(self, f"projection_only_{modality}"): + if self.proj_dim == 1: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].mean(-1)).reshape(bs, self.num_query_token, -1) + else: + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality].reshape(bs, self.num_query_token, -1)) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + continue + inputs_llm[modality] = getattr(self, f"{modality}_llm_proj")(query_outputs[modality]['last_hidden_state'][:,:query_tokens[modality].size(1),:]) + atts_llm[modality] = torch.ones(inputs_llm[modality].size()[:-1], dtype=torch.long).to(self.device) + if self.enumerate_inputs: + enumeration[modality] = self.llm_tokenizer( + [f"{'' if i == 0 else ' '}({chr(97+i)}) " for _ in prompt], + return_tensors="pt", + add_special_tokens=False if (i!= 0 or self.prefix) else True + ).to(self.device) + + att_list = [] + inp_list = [] + if self.prefix: + att_list = [self.tokenized_prefix.attention_mask.repeat(bs, 1).to(self.device)] + inp_list = [self.llm_model.get_input_embeddings()(self.tokenized_prefix.input_ids.to(self.device)).repeat(bs, 1, 1)] + + for modality in curr_modalities: + if self.enumerate_inputs: + enumeration_inputs_llm = self.llm_model.get_input_embeddings()(enumeration[modality].input_ids.to(self.device)) + enumeration_atts_llm = enumeration[modality].attention_mask.to(self.device) + inp_list.extend([enumeration_inputs_llm]) + att_list.extend([enumeration_atts_llm]) + if self.use_cues: + if self.clean_tokenization or self.remove_start: + if (modality==curr_modalities[0] and not (self.prefix or self.enumerate_inputs)): + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality]]) + 
inp_list.extend([self.emb_cue[modality].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality]]) + else: + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask[:,1:]).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality]]) + inp_list.extend([self.emb_cue[modality][:,1:].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality]]) + else: + att_list.extend([torch.tensor(self.tokenized_cue[modality].attention_mask).to(self.device).repeat(atts_llm[modality].shape[0], 1), atts_llm[modality]]) + inp_list.extend([self.emb_cue[modality].to(self.device).repeat(inputs_llm[modality].shape[0], 1, 1), inputs_llm[modality]]) + + else: + att_list.extend([atts_llm[modality]]) + inp_list.extend([inputs_llm[modality]]) + + if self.add_space: + space_tok = self.llm_tokenizer( + [f" " for _ in prompt], + return_tensors="pt", + add_special_tokens=False + ) + space_inputs_llm = self.llm_model.get_input_embeddings()(space_tok.input_ids.to(self.device)) + space_atts_llm = space_tok.attention_mask.to(self.device) + inp_list.extend([space_inputs_llm]) + att_list.extend([space_atts_llm]) + + + + atts_llm = torch.cat(att_list, dim=1) + empty_targets = torch.ones(atts_llm.size(), dtype=torch.long).to(self.device).fill_(-100) + inputs_llm = torch.cat(inp_list, dim=1) + + + self.llm_tokenizer.padding_side = "right" + self.llm_tokenizer.truncation_side = 'left' + + + text_input_tokens = self.llm_tokenizer( + [f"{p}{self.postfix}" for p in prompt] if self.postfix else prompt, + padding="longest", + return_tensors="pt", + add_special_tokens= not self.clean_tokenization + ).to(self.device) + + self.llm_tokenizer.truncation_side = 'right' + n_cands = len(candidates) + with self.maybe_autocast(): + all_losses = [] + for n in range(n_segments): + seg_len = n_cands // n_segments + if n == (n_segments - 1): + seg_len = n_cands - seg_len * (n_segments - 1) + + start_i = n * (n_cands // n_segments) + end_i = 
start_i + seg_len + this_output_tokens = self.llm_tokenizer( + candidates[start_i:end_i], + return_tensors="pt", + padding="longest", + # truncation=True, + # max_length=self.max_output_txt_len, + ).to(self.device) + + this_input_tokens_ids = text_input_tokens.input_ids.repeat_interleave(seg_len, dim=0) + this_input_tokens_atts = text_input_tokens.attention_mask.repeat_interleave(seg_len, dim=0) + + this_output_tokens_ids = this_output_tokens.input_ids.repeat(bs, 1) + this_output_tokens_atts = this_output_tokens.attention_mask.repeat(bs, 1) + + this_llm_tokens, this_input_targets_len = self.concat_text_input_output( + this_input_tokens_ids, + this_input_tokens_atts, + this_output_tokens_ids, + this_output_tokens_atts + ) + + this_llm_input_ids = this_llm_tokens['input_ids'] + this_llm_atts = this_llm_tokens['attention_mask'] + + inputs_embeds = self.llm_model.get_input_embeddings()(this_llm_input_ids) + + if self.use_caption: + inputs_embeds = torch.cat([inputs_embeds], dim=1) + attention_mask = torch.cat([this_llm_atts], dim=1) + else: + inputs_embeds = torch.cat([inputs_llm.repeat_interleave(seg_len, dim=0), inputs_embeds], dim=1) + attention_mask = torch.cat([atts_llm.repeat_interleave(seg_len, dim=0), this_llm_atts], dim=1) + + + this_targets = this_llm_input_ids.masked_fill(this_llm_input_ids == self.llm_tokenizer.pad_token_id, -100) + + for i, l in enumerate(this_input_targets_len): + this_targets[i][:l] = -100 + + if self.use_caption: + torch.cat([this_targets], dim=1) + else: + this_targets = torch.cat([empty_targets.repeat_interleave(seg_len, dim=0), this_targets], dim=1) + + + outputs = self.llm_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + return_dict=True, + labels=this_targets, + reduction="none", + ) + + loss = outputs.loss + + loss = loss.reshape(bs, seg_len) + all_losses.append(loss) + + all_losses = torch.cat(all_losses, dim=-1) + all_losses = -all_losses + output_class_ranks = torch.argsort(all_losses, dim=-1) + return 
{"predictions": all_losses, "targets": torch.tensor([self.candidates.index(l) for l in samples["label"]])} + + def _lemmatize(self, answers): + def apply(answer): + doc = self.lemmatizer(answer) + + words = [] + for token in doc: + if token.pos_ in ["NOUN", "VERB"]: + words.append(token.lemma_) + else: + words.append(token.text) + answer = " ".join(words) + + return answer + + return [apply(answer) for answer in answers] + + @property + def lemmatizer(self): + if self._lemmatizer is None: + try: + import spacy + + self._lemmatizer = spacy.load("en_core_web_sm") + except ImportError: + logging.error( + """ + Please install spacy and en_core_web_sm model to apply lemmatization. + python -m spacy download en_core_web_sm + OR + import spacy.cli + spacy.cli.download("en_core_web_sm") + """ + ) + exit(1) + + return self._lemmatizer + + def get_optimizer_params(self, weight_decay, lr_scale=1): + return BaseModel.get_optimizer_params(self, weight_decay, lr_scale=lr_scale) + + @classmethod + def from_config(cls, cfg): + image_model = cfg.get("image_model","eva_clip_g") + pc_model = cfg.get("pc_model","ulip2_pointbert") + video_model = cfg.get("video_model","eva_clip_g") + audio_model = cfg.get("audio_model","beats") + + pretrained_image_qformer = cfg.get("pretrained_image_qformer",None) + pretrained_pc_qformer = cfg.get("pretrained_pc_qformer",None) + pretrained_video_qformer = cfg.get("pretrained_video_qformer",None) + pretrained_audio_qformer = cfg.get("pretrained_audio_qformer",None) + + load_attention_image_qformer = cfg.get("load_attention_image_qformer",False) + load_attention_pc_qformer = cfg.get("load_attention_pc_qformer",False) + load_attention_video_qformer = cfg.get("load_attention_video_qformer",False) + load_attention_audio_qformer = cfg.get("load_attention_audio_qformer",False) + + load_qformer_type_image=cfg.get('load_qformer_type_image', "") + load_qformer_type_pc=cfg.get('load_qformer_type_pc', "") + 
load_qformer_type_video=cfg.get('load_qformer_type_video', "") + load_qformer_type_audio=cfg.get('load_qformer_type_audio',"") + + load_projection_image=cfg.get('load_projection_image', True) + load_projection_pc=cfg.get('load_projection_pc', True) + load_projection_video=cfg.get('load_projection_video', True) + load_projection_audio=cfg.get('load_projection_audio', True) + + load_projection_type_image=cfg.get('load_projection_type_image', "") + load_projection_type_pc=cfg.get('load_projection_type_pc', "") + load_projection_type_video=cfg.get('load_projection_type_video', "") + load_projection_type_audio=cfg.get('load_projection_type_audio', "") + + load_ln_type_image=cfg.get('load_ln_type_image', "") + load_ln_type_pc=cfg.get('load_ln_type_pc', "") + load_ln_type_video=cfg.get('load_ln_type_video', "") + load_ln_type_audio=cfg.get('load_ln_type_audio', "") + + image_encoder_kwargs = cfg.get("image_encoder_kwargs", {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}) + pc_encoder_kwargs = cfg.get("pc_encoder_kwargs",{}) + video_encoder_kwargs = cfg.get("video_encoder_kwargs",{}) + audio_encoder_kwargs = cfg.get("audio_encoder_kwargs",{}) + + image_precision = cfg.get("image_precision","fp16") + pc_precision = cfg.get("pc_precision","fp16") + video_precision = cfg.get("video_precision","fp16") + audio_precision = cfg.get("audio_precision","fp16") + + freeze_image = cfg.get("freeze_image",True) + freeze_pc = cfg.get("freeze_pc",True) + freeze_video = cfg.get("freeze_video",True) + freeze_audio = cfg.get("freeze_audio",True) + num_query_token = cfg.get("num_query_token") + + llm_model = cfg.get("llm_model") + freeze_pc = cfg.get("freeze_pc", True) + freeze_video = cfg.get("freeze_video", True) + freeze_audio = cfg.get("freeze_audio", True) + + prompt = cfg.get("prompt", "") + max_txt_len = cfg.get("max_txt_len", 128) + max_output_txt_len = cfg.get("max_output_txt_len", 256) + + apply_lemmatizer = cfg.get("apply_lemmatizer", False) + + 
qformer_text_input = cfg.get("qformer_text_input", True) + modalities = cfg.get("modalities", ["image"]) + use_cues = cfg.get("use_cues", True) + shared_qformer = cfg.get("shared_qformer",False) + pretrained_shared_qformer = cfg.get("pretrained_shared_qformer", None) + load_attention_shared_qformer = cfg.get("load_attention_shared_qformer", None) + load_qformer_type_shared= cfg.get('load_qformer_type_shared',"") + load_projection_shared= cfg.get('load_projection_shared',False) + load_projection_type_shared= cfg.get('load_projection_type_shared',"") + shared_qformer_num_features=cfg.get("shared_qformer_num_features", 512) + encoder_projection_type_image=cfg.get("encoder_projection_type_image","") + encoder_projection_type_video=cfg.get("encoder_projection_type_video","") + encoder_projection_type_audio=cfg.get("encoder_projection_type_audio","") + encoder_projection_type_pc=cfg.get("encoder_projection_type_pc","") + + llm_text_input = cfg.get("llm_text_input", True) + lora = cfg.get("lora", False) + prefix = cfg.get("prefix", "") + postfix = cfg.get("postfix", "") + + cached_audio= cfg.get("cached_audio", False) + cached_image= cfg.get("cached_image", False) + cached_video= cfg.get("cached_video", False) + cached_pc= cfg.get("cached_pc", False) + + num_features_audio=cfg.get('num_features_audio', 768) + num_features_image=cfg.get('num_features_image', 1408) + num_features_video=cfg.get('num_features_video', 14080) + num_features_pc=cfg.get('num_features_depth', 512) + + joint_video_audio=cfg.get('joint_video_audio', False) + use_caption=cfg.get('use_caption', False) + use_describe=cfg.get('use_describe', False) + predict_with_gen = cfg.get('predict_with_gen', False) + format_candidates_prompt = cfg.get('format_candidates_prompt', "{}") + special_qformer_input_prompt = cfg.get('special_qformer_input_prompt', False) + enumerate_inputs = cfg.get('enumerate_inputs', False) + add_space = cfg.get('add_space', True) + projection_only = cfg.get('projection_only', False) + + 
lora_model = cfg.get('lora_model', '') + + projection_only_audio= cfg.get('projection_only_audio', False) + projection_only_pc= cfg.get('projection_only_pc', False) + projection_only_video= cfg.get('projection_only_video', False) + projection_only_image= cfg.get('projection_only_image', False) + + projection_path_audio=cfg.get('projection_path_audio', False) + projection_path_pc=cfg.get('projection_path_pc', False) + projection_path_video=cfg.get('projection_path_video', False) + projection_path_image=cfg.get('projection_path_image', False) + remove_start=cfg.get('remove_start', False) + proj_dim=cfg.get('proj_dim', 1) + clean_tokenization=cfg.get('clean_tokenization', False) + + logging.info("Model Config Arguments:") + logging.info(OmegaConf.to_yaml(cfg)) + + model = cls( + image_model=image_model, + pc_model=pc_model, + video_model=video_model, + audio_model=audio_model, + + pretrained_image_qformer=pretrained_image_qformer, + pretrained_pc_qformer=pretrained_pc_qformer, + pretrained_video_qformer=pretrained_video_qformer, + pretrained_audio_qformer=pretrained_audio_qformer, + + load_attention_image_qformer=load_attention_image_qformer, + load_attention_pc_qformer=load_attention_pc_qformer, + load_attention_video_qformer=load_attention_video_qformer, + load_attention_audio_qformer=load_attention_audio_qformer, + + load_qformer_type_image=load_qformer_type_image, + load_qformer_type_pc=load_qformer_type_pc, + load_qformer_type_video=load_qformer_type_video, + load_qformer_type_audio=load_qformer_type_audio, + + load_projection_image=load_projection_image, + load_projection_pc=load_projection_pc, + load_projection_video=load_projection_video, + load_projection_audio=load_projection_audio, + + load_projection_type_image=load_projection_type_image, + load_projection_type_pc=load_projection_type_pc, + load_projection_type_video=load_projection_type_video, + load_projection_type_audio=load_projection_type_audio, + + load_ln_type_image=load_ln_type_image, + 
load_ln_type_pc=load_ln_type_pc, + load_ln_type_video=load_ln_type_video, + load_ln_type_audio=load_ln_type_audio, + + image_encoder_kwargs = image_encoder_kwargs, + pc_encoder_kwargs = pc_encoder_kwargs, + video_encoder_kwargs = video_encoder_kwargs, + audio_encoder_kwargs = audio_encoder_kwargs, + + image_precision=image_precision, + pc_precision=pc_precision, + video_precision=video_precision, + audio_precision=audio_precision, + + freeze_image=freeze_image, + freeze_pc=freeze_pc, + freeze_video=freeze_video, + freeze_audio=freeze_audio, + + num_query_token=num_query_token, + llm_model=llm_model, + lora_model=lora_model, + lora = lora, + prompt=prompt, + max_txt_len=max_txt_len, + max_output_txt_len=max_output_txt_len, + apply_lemmatizer=apply_lemmatizer, + qformer_text_input=qformer_text_input, + modalities=modalities, + use_cues=use_cues, + llm_text_input=llm_text_input, + shared_qformer=shared_qformer, + pretrained_shared_qformer = pretrained_shared_qformer, + load_attention_shared_qformer = load_attention_shared_qformer, + shared_qformer_num_features=shared_qformer_num_features, + load_qformer_type_shared= load_qformer_type_shared, + load_projection_shared= load_projection_shared, + + encoder_projection_type_image=encoder_projection_type_image, + encoder_projection_type_video=encoder_projection_type_video, + encoder_projection_type_audio=encoder_projection_type_audio, + encoder_projection_type_pc=encoder_projection_type_pc, + + projection_path_audio=projection_path_audio, + projection_path_pc=projection_path_pc, + projection_path_video=projection_path_video, + projection_path_image=projection_path_image, + + load_projection_type_shared= load_projection_type_shared, + + prefix=prefix, + postfix=postfix, + + cached_audio=cached_audio, + cached_image=cached_image, + cached_video=cached_video, + cached_pc=cached_pc, + + num_features_audio=num_features_audio, + num_features_image=num_features_image, + num_features_video=num_features_video, + 
num_features_pc=num_features_pc, + + joint_video_audio=joint_video_audio, + use_caption=use_caption, + use_describe=use_describe, + predict_with_gen=predict_with_gen, + format_candidates_prompt=format_candidates_prompt, + special_qformer_input_prompt=special_qformer_input_prompt, + enumerate_inputs=enumerate_inputs, + add_space=add_space, + projection_only=projection_only, + + projection_only_audio= projection_only_audio, + projection_only_pc= projection_only_pc, + projection_only_video= projection_only_video, + projection_only_image= projection_only_image, + remove_start= remove_start, + proj_dim=proj_dim, + clean_tokenization=clean_tokenization + ) + + stage1_url_or_filename = cfg.get("stage1_url_or_filename","") + + if stage1_url_or_filename: + model.load_from_pretrained(stage1_url_or_filename) + + model.load_checkpoint_from_config(cfg) + return model + + @classmethod + def init_ln(cls, num_features, load_ln_path=False, load_ln_type=""): + ln = LayerNorm(num_features) + if load_ln_path and load_ln_type: + url_or_filename=load_ln_path + logging.info(f"Loading pretrained layer norm weights from {url_or_filename} of type {load_ln_type}") + if is_url(url_or_filename): + cached_file = download_cached_file( + url_or_filename, check_hash=False, progress=True + ) + checkpoint = torch.load(cached_file, map_location="cpu") + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location="cpu") + else: + raise RuntimeError("checkpoint url or path is invalid") + + if load_ln_type: + load_ln_type = f"{load_ln_type}_ln" if "vision" not in load_ln_type else "ln_vision" + loaded_state_dict = {} + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + for k in checkpoint.keys(): + if load_ln_type in k: + loaded_state_dict['.'.join(k.split('.')[1:])] = checkpoint[k] + ln.load_state_dict(loaded_state_dict, strict=False) + + return ln + + @classmethod + def init_encoder_projection(cls, enc_num_features, shared_qformer_num_features, 
@classmethod
def init_vicuna_projection(cls, input_size, output_size, load_projection_path=False, load_projection_type="", projection_key=None):
    """Create the linear projection and optionally load its weights.

    Args:
        input_size: Input feature size of the ``nn.Linear``.
        output_size: Output feature size of the ``nn.Linear``.
        load_projection_path: URL or local file path of a checkpoint. A
            falsy value returns the freshly initialised layer.
        load_projection_type: Prefix used to build the default key filter
            ``"<type>_llm_proj."``; with an empty value the filter is
            ``"llm_proj."``.
        projection_key: When truthy, checkpoint keys containing this
            substring are used instead of the default filter.

    Returns:
        The ``nn.Linear`` projection.

    Raises:
        RuntimeError: If ``load_projection_path`` is truthy but is neither
            a URL nor an existing file.
    """
    proj = nn.Linear(input_size, output_size)
    if not load_projection_path:
        return proj

    url_or_filename = load_projection_path
    logging.info(f"Loading pretrained projection weights from {url_or_filename} of type {load_projection_type} with key {projection_key if projection_key else load_projection_type+'_llm_proj.'}")
    if is_url(url_or_filename):
        local_copy = download_cached_file(
            url_or_filename, check_hash=False, progress=True
        )
        checkpoint = torch.load(local_copy, map_location="cpu")
    elif os.path.isfile(url_or_filename):
        checkpoint = torch.load(url_or_filename, map_location="cpu")
    else:
        raise RuntimeError("checkpoint url or path is invalid")

    type_prefix = f"{load_projection_type}_" if load_projection_type else load_projection_type
    # The substring to look for is the same for every key, so pick it once.
    needle = projection_key if projection_key else type_prefix + 'llm_proj.'
    weights = checkpoint['model'] if 'model' in checkpoint else checkpoint
    # Strip the first dotted component so names match the bare Linear layer.
    selected = {
        name.partition('.')[2]: tensor
        for name, tensor in weights.items()
        if needle in name
    }
    proj.load_state_dict(selected, strict=False)
    return proj
def get_state_dict(self, url_or_filename, **kwargs):
    """Load a checkpoint on CPU and return its state dict.

    Args:
        url_or_filename: URL (downloaded through ``download_cached_file``)
            or path of an existing local file.
        **kwargs: Accepted and ignored.

    Returns:
        ``checkpoint["model"]`` when the loaded object has a ``"model"``
        key, otherwise the loaded object itself.

    Raises:
        RuntimeError: If ``url_or_filename`` is neither a URL nor an
            existing file.
    """
    if is_url(url_or_filename):
        location = download_cached_file(
            url_or_filename, check_hash=False, progress=True
        )
    elif os.path.isfile(url_or_filename):
        location = url_or_filename
    else:
        raise RuntimeError("checkpoint url or path is invalid")

    checkpoint = torch.load(location, map_location="cpu")
    return checkpoint["model"] if "model" in checkpoint.keys() else checkpoint
+ """ + state_dict = self.get_state_dict(url_or_filename) + self.load_state_dict(state_dict, strict=True) + logging.info("load checkpoint from %s" % url_or_filename) + + def load_state_dict(self, state_dict, strict=True): + # from pdb import set_trace; set_trace() + unexpected_keys = [] + missing_keys = [] + if self.shared_qformer and not self.projection_only: + ## Load Q-Former if it is not loaded from config + if not getattr(self, "pretrained_shared_qformer"): + shared_qformer_state_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if "shared_Qformer" == k.split('.')[0]} + msg = self.shared_Qformer.load_state_dict(shared_qformer_state_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + ## Load query tokens + if "shared_query_tokens" not in state_dict: + missing_keys.append("shared_query_tokens") + else: + self.shared_query_tokens = state_dict["shared_query_tokens"] + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + + for modality in self.modalities: + # Map shared Qformer by reference to all modalities. 
+ setattr(self, f"{modality}_Qformer", self.shared_Qformer) + getattr(self, f"{modality}_query_tokens").data = state_dict[f"shared_query_tokens"] + # load encoder projections + modality_encoder_projection_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"{modality}_encoder_projection" in k.split('.')[0]} + msg = getattr(self, f"{modality}_encoder_projection").load_state_dict(modality_encoder_projection_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + # load modality layer norm + if getattr(self,f"load_ln_type_{modality}") == "vision": + modality_ln_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"ln_vision" in k.split('.')[0]} + else: + modality_ln_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"{modality}_ln" in k.split('.')[0]} + msg = getattr(self, f"{modality}_ln").load_state_dict(modality_ln_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + + ## Load Shared LLM projection if not loaded by config + if not getattr(self, "load_projection_shared"): + shared_llm_projection_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"shared_llm_proj" in k.split('.')[0]} + msg = self.shared_llm_proj.load_state_dict(shared_llm_projection_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + for modality in self.modalities: + ## Map to modality projections by reference + msg = setattr(self, f"{modality}_llm_proj", self.shared_llm_proj) + else: + for modality in self.modalities: + ## Load Q-Former if not loaded from config + if not getattr(self, f"pretrained_{modality}_qformer") or ((self.projection_only or getattr(self, f"projection_only_{modality}")) and not getattr(self, f"projection_path_{modality}")): + + if self.projection_only or getattr(self, f"projection_only_{modality}") : + if not getattr(self, 
f"projection_path_{modality}"): + logging.info(f"Loaded {modality} projection") + modality_qformer_state_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"{modality}_projection" == k.split('.')[0]} + msg = getattr(self, f"{modality}_projection").load_state_dict(modality_qformer_state_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + else: + modality_qformer_state_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"{modality}_Qformer" == k.split('.')[0]} + msg = getattr(self, f"{modality}_Qformer").load_state_dict(modality_qformer_state_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + ## Load query tokens + if not self.projection_only and not getattr(self, f"projection_only_{modality}"): + if f"{modality}_query_tokens" not in state_dict: + missing_keys.append(f"{modality}_query_tokens") + else: + logging.info(f"Loaded {modality} query tokens") + getattr(self, f"{modality}_query_tokens").data = state_dict[f"{modality}_query_tokens"] + # load modality layer norm if not loaded from config + if getattr(self,f"load_ln_type_{modality}") == "vision": + logging.info(f"Loaded {modality} vision ln") + modality_ln_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"ln_vision" in k.split('.')[0]} + else: + modality_ln_dict = {'.'.join(k.split('.')[1:]):v for k,v in state_dict.items() if f"{modality}_ln" in k.split('.')[0]} + msg = getattr(self, f"{modality}_ln").load_state_dict(modality_ln_dict, strict=strict) + missing_keys.extend(msg.missing_keys) + unexpected_keys.extend(msg.unexpected_keys) + ## Load LLM projections if not loaded from config + if not getattr(self, f"load_projection_{modality}") or (getattr(self, f"projection_only_{modality}") or self.projection_only): + if not getattr(self, f"projection_path_{modality}"): + logging.info(f"Loaded {modality} llm projection") + 
def before_evaluation(self, dataset, task_type, **kwargs):
    """Cache the dataset's class names before a classification evaluation.

    When ``task_type`` equals ``MultimodalClassificationTask`` the value of
    ``dataset.classnames`` is stored on ``self.candidates`` and printed.
    For any other task type nothing happens. ``**kwargs`` is ignored.
    """
    if not (task_type == MultimodalClassificationTask):
        return
    self.candidates = dataset.classnames
    print(self.candidates)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch LLaMA model.""" +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from transformers.models.llama.configuration_llama import LlamaConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlamaConfig" + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. 
class LlamaRotaryEmbedding(torch.nn.Module):
    """Cached cos/sin tables for rotary position embeddings.

    The tables are built once for ``max_position_embeddings`` positions and
    rebuilt on demand when a longer sequence is requested.

    Args:
        dim: Size of the last axis of the tables (the per-head size).
        max_position_embeddings: Number of positions to precompute.
        base: Base of the geometric progression of inverse frequencies.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        self._build_cache(self.max_seq_len_cached, self.inv_freq.device)

    def _build_cache(self, seq_len, device):
        """(Re)create the non-persistent cos/sin buffers for ``seq_len`` positions."""
        inv_freq = self.inv_freq.to(device)
        t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)

    def forward(self, x, seq_len=None):
        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            x: Tensor of shape ``[bs, num_attention_heads, seq_len, head_size]``;
                only its dtype, device and (when ``seq_len`` is omitted) its
                second-to-last dimension are used.
            seq_len: Number of positions required. Defaults to ``x.shape[-2]``.

        Returns:
            Two tensors of shape ``[1, 1, seq_len, dim]`` cast to ``x.dtype``.
        """
        # The signature advertises ``seq_len=None``; previously that default
        # raised TypeError on the comparison below.
        if seq_len is None:
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            self._build_cache(seq_len, x.device)
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamaConfig):
        """Create the q/k/v/o projections and the rotary embedding.

        Reads ``hidden_size``, ``num_attention_heads`` and
        ``max_position_embeddings`` from ``config``.

        Raises:
            ValueError: If ``hidden_size`` is not divisible by the number of heads.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape ``tensor`` to ``(bsz, num_heads, seq_len, head_dim)``."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run self-attention with rotary position embeddings.

        Args:
            hidden_states: Input of shape ``(bsz, q_len, hidden_size)``.
            attention_mask: Optional additive mask; must have shape
                ``(bsz, 1, q_len, kv_seq_len)``.
            position_ids: Positions passed on to ``apply_rotary_pos_emb``.
            past_key_value: Optional ``(key, value)`` pair from earlier steps;
                it is concatenated in front of the new keys/values along
                dimension 2.
            output_attentions: Whether to return the attention weights.
            use_cache: Whether to return the updated ``(key, value)`` pair.

        Returns:
            ``(attn_output, attn_weights, past_key_value)`` where
            ``attn_weights`` is ``None`` unless ``output_attentions`` is set
            and ``past_key_value`` is ``None`` unless ``use_cache`` is set.

        Raises:
            ValueError: If the attention weights, the attention mask or the
                attention output do not have the expected shape.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Total key/value length = new tokens + whatever is already cached.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            # Report the same 4-D shape that the check above compares against.
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            # Adding the mask can push values below the dtype minimum; floor
            # them. The floor tensor is created on the same device as the
            # weights instead of the default device.
            attn_weights = torch.max(
                attn_weights,
                torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device),
            )

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Merge the heads back: (bsz, nh, q_len, hd) -> (bsz, q_len, hidden_size).
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) 
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    # Shared base for the LLaMA models in this file: declares the config
    # class and checkpoint prefix and supplies weight initialisation.
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

    def _init_weights(self, module):
        """Initialise ``nn.Linear`` / ``nn.Embedding`` weights.

        Weights are drawn from a normal distribution with mean 0 and std
        ``config.initializer_range``. Linear biases (when present) and the
        embedding row at ``padding_idx`` (when set) are zeroed. Other module
        types are left untouched.
        """
        std = self.config.initializer_range
        is_linear = isinstance(module, nn.Linear)
        if not is_linear and not isinstance(module, nn.Embedding):
            return

        module.weight.data.normal_(mean=0.0, std=std)
        if is_linear:
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set ``gradient_checkpointing`` on ``module`` if it is a ``LlamaModel``."""
        if not isinstance(module, LlamaModel):
            return
        module.gradient_checkpointing = value
+ + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
+ + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + 
use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, 
(batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + 
attentions=all_self_attns, + ) + + +class LlamaForCausalLM(LlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.model = LlamaModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + reduction: Optional[str] = "mean", + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction=reduction) + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + if reduction == "none": + # loss = 
loss.view(logits.size(0), -1).sum(1) + loss = loss.view(logits.size(0), -1).mean(1) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForSequenceClassification(LlamaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not 
return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) \ No newline at end of file diff --git a/lavis/models/blip2_models/modeling_opt.py b/lavis/models/blip2_models/modeling_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4077c83a706825131be82702deba5e344b87e0 --- /dev/null +++ b/lavis/models/blip2_models/modeling_opt.py @@ -0,0 +1,1113 @@ +# coding=utf-8 +# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch OPT model.""" +import random +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from transformers.models.opt.configuration_opt import OPTConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/opt-350m" +_CONFIG_FOR_DOC = "OPTConfig" +_TOKENIZER_FOR_DOC = "GPT2Tokenizer" + +# Base model docstring +_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024] + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/opt-350m-dummy-sc" +_SEQ_CLASS_EXPECTED_LOSS = 1.71 +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_0'" + +# QuestionAnswering docstring +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 + +OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/opt-125m", + "facebook/opt-350m", + "facebook/opt-1.3b", + "facebook/opt-2.7b", + "facebook/opt-6.7b", + "facebook/opt-13b", + "facebook/opt-30b", + # See all OPT models at https://huggingface.co/models?filter=opt +] + + +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. 
+ """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min)) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat( + [torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1 + ) + return mask[None, None, :, :].expand( + bsz, 1, tgt_len, tgt_len + past_key_values_length + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +class OPTLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward( + self, attention_mask: torch.LongTensor, past_key_values_length: int = 0 + ): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + attention_mask = attention_mask.long() + + # create positions depending on attention_mask + positions = ( + torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask + ).long() - 1 + + # cut positions if `past_key_values_length` is > 0 + positions = positions[:, past_key_values_length:] + + return super().forward(positions + self.offset) + + +class OPTAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = 
self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + # upcast to fp32 if the weights are in fp16. 
Please see https://github.com/huggingface/transformers/pull/17437 + if attn_weights.dtype == torch.float16: + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(torch.float16) + else: + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + attn_weights = attn_weights_reshaped.view( + bsz * self.num_heads, tgt_len, src_len + ) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class OPTDecoderLayer(nn.Module): + def __init__(self, config: OPTConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = OPTAttention( + embed_dim=self.embed_dim, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.do_layer_norm_before = config.do_layer_norm_before + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim) + self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + hidden_states = residual + hidden_states + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Fully Connected + hidden_states_shape = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + hidden_states = (residual + hidden_states).view(hidden_states_shape) + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +OPT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`OPTConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare OPT Model outputting raw hidden-states without any specific head on top.", + OPT_START_DOCSTRING, +) +class OPTPreTrainedModel(PreTrainedModel): + + config_class = OPTConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["OPTDecoderLayer"] + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (OPTDecoder)): + module.gradient_checkpointing = value + + +OPT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`GPT2Tokenizer`]. 
See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
class OPTDecoder(OPTPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    """

    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.word_embed_proj_dim, self.padding_idx
        )
        self.embed_positions = OPTLearnedPositionalEmbedding(
            config.max_position_embeddings, config.hidden_size
        )

        # Token embeddings may live in a different width than the hidden
        # states; in that case project in before the layers and back out after
        # them. `project_out` is built first to keep the original module
        # registration (and RNG consumption) order.
        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = nn.Linear(
                config.hidden_size, config.word_embed_proj_dim, bias=False
            )
            self.project_in = nn.Linear(
                config.word_embed_proj_dim, config.hidden_size, bias=False
            )
        else:
            self.project_out = None
            self.project_in = None

        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
        # with checkpoints that have been fine-tuned before transformers v4.20.1
        # see https://github.com/facebookresearch/metaseq/pull/164
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(config.hidden_size)
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList(
            [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)]
        )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with ``value``."""
        self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(
        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
    ):
        """Combine the causal mask with the expanded padding mask.

        A causal mask is only built when more than one position is decoded
        (``input_shape[-1] > 1``). The padding mask, when given, is expanded
        and added to it. Returns ``None`` when neither mask applies.
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                past_key_values_length=past_key_values_length,
            ).to(inputs_embeds.device)

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(
                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            ).to(inputs_embeds.device)
            combined_attention_mask = (
                expanded_attn_mask
                if combined_attention_mask is None
                else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        query_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        r"""
        Run the decoder stack over token ids or pre-computed embeddings.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                When omitted, an all-ones mask is created that spans the cached positions (if any), the prepended
                `query_embeds` (if any) and the current inputs.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Per-layer cached key/value states from a previous call. The cached length is read from
                `past_key_values[0][0].shape[2]`.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            query_embeds (`torch.FloatTensor`, *optional*):
                Extra embeddings that are concatenated in front of the token embeddings along the sequence dimension
                before positions are added.
            use_cache (`bool`, *optional*):
                If set to `True`, the per-layer key/value states are returned. Falls back to `config.use_cache`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Raises:
            ValueError: If both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask` does not
                have one entry per decoder layer.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        past_key_values_length = (
            past_key_values[0][0].shape[2] if past_key_values is not None else 0
        )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if query_embeds is not None:
            # Prepend the query embeddings; every later shape is derived from
            # the concatenated sequence.
            inputs_embeds = torch.cat([query_embeds, inputs_embeds], dim=1)
            input_shape = inputs_embeds.size()[:-1]

        # embed positions
        if attention_mask is None:
            # The default mask has to span cached + current positions: the
            # causal mask below is built with `past_key_values_length`, and
            # positions are computed from this mask with the same offset.
            # Without a cache this is exactly `inputs_embeds.shape[:2]`.
            attention_mask = torch.ones(
                inputs_embeds.shape[0],
                past_key_values_length + inputs_embeds.shape[1],
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        pos_embeds = self.embed_positions(attention_mask, past_key_values_length)

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        if self.project_in is not None:
            inputs_embeds = self.project_in(inputs_embeds)

        hidden_states = inputs_embeds + pos_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The `head_mask` should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                # NOTE(review): the positional arguments below rely on the
                # parameter order of OPTDecoderLayer.forward, which is not
                # visible in this chunk - confirm they line up.
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                )
            else:

                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry follows the attention weights when those are returned.
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)

        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            # Tuple output drops the entries that were not requested.
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
class OPTForCausalLM(OPTPreTrainedModel):
    # OPT decoder with a linear language-modeling head on top.
    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = OPTModel(config)

        # the lm_head weight is automatically tied to the embed tokens weight
        self.lm_head = nn.Linear(
            config.word_embed_proj_dim, config.vocab_size, bias=False
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with ``value``."""
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with ``new_embeddings``."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder with ``decoder``."""
        self.model.decoder = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model.decoder

    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        query_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        reduction: Optional[str] = "mean",
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            query_embeds (`torch.FloatTensor`, *optional*):
                Extra embeddings forwarded unchanged to the decoder.
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
                Only the last `labels.size(1)` logit positions are kept (and returned) when labels are given.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            reduction (`str`, *optional*, defaults to `"mean"`):
                Passed to `CrossEntropyLoss`. With `"none"` the per-token losses are summed per sample, giving a loss
                of shape `(batch_size,)`.

        Returns:

        Example:

        ```python
        >>> from transformers import GPT2Tokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you consciours? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
        ```"""

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            query_embeds=query_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0]).contiguous()

        loss = None
        if labels is not None:
            # Keep only the trailing positions that have a label; presumably
            # this drops the positions of the prepended `query_embeds`.
            logits = logits[:, -labels.size(1) :, :]

            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(reduction=reduction)
            loss = loss_fct(
                shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)
            )
            if reduction == "none":
                # Fold the flat per-token losses back into one sum per sample.
                loss = loss.view(shift_logits.size(0), -1).sum(1)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids=None,
        query_embeds=None,
        past=None,
        attention_mask=None,
        use_cache=None,
        past_key_values=None,
        **kwargs,
    ):
        """Build the keyword arguments for one decoding step.

        The cache may arrive under either the legacy ``past`` keyword or the
        ``past_key_values`` keyword; ``past`` wins when both are given. Once a
        cache is present only the last token id is fed and ``query_embeds`` is
        dropped, because the cache already covers the earlier positions.

        Returns:
            dict with ``input_ids``, ``query_embeds``, ``attention_mask``,
            ``past_key_values`` and ``use_cache``.
        """
        # Accept both spellings of the cache keyword. Previously a cache passed
        # as `past_key_values` was swallowed by **kwargs and silently ignored.
        if past is None:
            past = past_key_values

        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            if input_ids is not None:
                attention_mask = input_ids.new_ones(input_ids.shape)
        if past:
            input_ids = input_ids[:, -1:]
            query_embeds = None
        # first step, decoder_cached_states are empty
        return {
            "input_ids": input_ids,
            "query_embeds": query_embeds,
            "attention_mask": attention_mask,
            "past_key_values": past,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder every cached tensor along dim 0 according to ``beam_idx``."""
        return tuple(
            tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
            for layer_past in past
        )
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load the weights of a TensorFlow T5 checkpoint into ``model`` in place.

    Each checkpoint variable name is split on ``/`` and walked, scope by
    scope, from ``model`` down to the matching parameter. Optimizer state and
    ``_slot_`` variables are skipped. Every array except embeddings is
    transposed before it is copied as float32.

    Args:
        model: PyTorch module that receives the weights.
        config: Unused; kept for interface compatibility.
        tf_checkpoint_path: Path of the TensorFlow checkpoint.

    Returns:
        The same ``model`` object.

    Raises:
        ImportError: If TensorFlow (or numpy) cannot be imported.
        AssertionError: If a checkpoint array and its target parameter differ
            in shape.
    """
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    for txt_name in names:
        name = txt_name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(
            n
            in [
                "adam_v",
                "adam_m",
                "AdamWeightDecayOptimizer",
                "AdamWeightDecayOptimizer_1",
                "global_step",
            ]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        if "_slot_" in name[-1]:
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
        array = tf_weights[txt_name]

        for m_name in name:
            # "block_003" style scopes split into a name and a numeric index.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ["kernel", "scale", "embedding"]:
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "self_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[0]
            elif scope_names[0] == "enc_dec_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[1]
            elif scope_names[0] == "dense_relu_dense":
                pointer = getattr(pointer, "layer")
                pointer = pointer[2]
            elif scope_names[0] == "rms_norm":
                if hasattr(pointer, "layer_norm"):
                    pointer = getattr(pointer, "layer_norm")
                elif hasattr(pointer, "final_layer_norm"):
                    pointer = getattr(pointer, "final_layer_norm")
            # ("scale" needs no branch of its own: the first test above
            # already routes it to `weight`.)
            elif scope_names[0] in ("output_bias", "beta"):
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            elif scope_names[0] == "decoder" and name[1] == "logits":
                continue
            elif scope_names[0] == "logits":
                pointer = getattr(pointer, "lm_head")
            elif (
                scope_names[0] == "wi"
                and len(scope_names) > 1
                and scope_names[1].isdigit()
            ):
                # "wi_0" / "wi_1" are attribute names, not indexed scopes, so
                # the numeric-index step below must be skipped.
                pointer = getattr(pointer, f"wi_{scope_names[1]}")
                continue
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        # `scope_names` still holds the last path component here.
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        if scope_names[0] != "embedding":
            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
            array = np.transpose(array)
        # Explicit check instead of `assert`, so the validation also runs
        # under `python -O`. Exception type and args match the old behaviour.
        if pointer.shape != array.shape:
            raise AssertionError(
                f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched",
                pointer.shape,
                array.shape,
            )
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
    return model
class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Build a T5-style layer norm: a learned per-feature scale, no bias and
        no mean subtraction.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Scale ``hidden_states`` by the inverse root-mean-square of its last dim."""
        # Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467):
        # the variance is taken without subtracting the mean and there is no
        # bias. The squares are accumulated in fp32 even for half-precision
        # inputs.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normed = hidden_states * inv_rms

        # Cast back down when the scale itself is stored in half precision.
        weight_dtype = self.weight.dtype
        if weight_dtype == torch.float16 or weight_dtype == torch.bfloat16:
            normed = normed.to(weight_dtype)

        return self.weight * normed
class T5LayerFF(nn.Module):
    """
    Feed-forward sub-layer: layer norm -> dense block -> dropout, added back onto the input.

    The dense block is `T5DenseGatedActDense` when `config.is_gated_act` is truthy and
    `T5DenseActDense` otherwise.
    """

    def __init__(self, config: T5Config):
        super().__init__()
        dense_cls = T5DenseGatedActDense if config.is_gated_act else T5DenseActDense
        self.DenseReluDense = dense_cls(config)

        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        """Apply the normalised feed-forward transform and return `hidden_states` plus its (dropped-out) output."""
        residual = hidden_states
        transformed = self.DenseReluDense(self.layer_norm(hidden_states))
        return residual + self.dropout(transformed)
bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding( + self.relative_attention_num_buckets, self.n_heads + ) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket( + relative_position, bidirectional=True, num_buckets=32, max_distance=128 + ): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min( + relative_position, torch.zeros_like(relative_position) + ) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, + torch.full_like(relative_position_if_large, num_buckets - 1), + ) + + relative_buckets += torch.where( + is_small, relative_position, relative_position_if_large + ) + return relative_buckets + + def compute_bias(self, query_length, key_length, device=None): + """Compute binned relative position bias""" + if device is None: + device = self.relative_attention_bias.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[ + :, None + ] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[ + None, : + ] + relative_position = ( + memory_position - context_position + ) # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # 
shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias( + relative_position_bucket + ) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze( + 0 + ) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" + real_seq_length += ( + past_key_value[0].shape[2] if query_length is None else query_length + ) + + key_length = ( + real_seq_length if key_value_states is None else key_value_states.shape[1] + ) + + def shape(states): + """projection""" + return states.view( + batch_size, -1, self.n_heads, self.key_value_proj_dim + ).transpose(1, 2) + + def unshape(states): + """reshape""" + return ( + states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + ) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape( + self.q(hidden_states) + ) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, + self.k, + key_value_states, + past_key_value[0] if past_key_value is not None else None, + ) + value_states = project( + hidden_states, + self.v, + key_value_states, + past_key_value[1] if past_key_value is not None else None, + ) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, 
key_length), + device=scores.device, + dtype=scores.dtype, + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device + ) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = ( + position_bias + mask + ) # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape( + torch.matmul(attn_weights, value_states) + ) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + present_key_value_state = ( + (key_states, value_states) if (self.is_decoder and use_cache) else None + ) + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5Attention( + config, has_relative_attention_bias=has_relative_attention_bias + ) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, 
class T5LayerCrossAttention(nn.Module):
    """
    Cross-attention sub-layer: layer norm -> `T5Attention` over `key_value_states` -> dropout,
    added back onto the input. The attention module is built without a relative attention bias.
    """

    def __init__(self, config):
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
        """
        Attend from the normalised `hidden_states` to `key_value_states`.

        Returns:
            A tuple whose first element is the residual-updated hidden states, followed by every
            remaining element returned by the attention module, unchanged.
        """
        attn_results = self.EncDecAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
        )
        # Residual connection around the attention output; the extra attention outputs pass through.
        updated_states = hidden_states + self.dropout(attn_results[0])
        return (updated_states,) + attn_results[1:]
has_relative_attention_bias=has_relative_attention_bias + ) + ) + if self.is_decoder: + self.layer.append(T5LayerCrossAttention(config)) + + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + if not self.is_decoder: + logger.warning( + "`past_key_values` is passed to the encoder. Please make sure this is intended." + ) + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[ + 2: + ] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value + ) + + do_cross_attention = self.is_decoder and 
encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if ( + hidden_states.dtype == torch.float16 + and torch.isinf(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value + ) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = ( + present_key_value_state + cross_attention_outputs[1] + ) + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value + ) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), 
(cross-attention weights) + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = T5Config + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + _no_split_modules = ["T5Block"] + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """Initialize the weights""" + factor = ( + self.config.initializer_factor + ) # Used for testing weights initialization + if isinstance(module, T5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: + module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, T5DenseActDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model) ** -0.5) + ) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_ff) ** -0.5) + ) + if hasattr(module.wo, "bias") and 
module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5DenseGatedActDense): + module.wi_0.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model) ** -0.5) + ) + if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model) ** -0.5) + ) + if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_ff) ** -0.5) + ) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_( + mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5) + ) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_( + mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5) + ) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_( + mean=0.0, std=factor * ((d_model) ** -0.5) + ) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (T5Attention, T5Stack)): + module.gradient_checkpointing = value + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert decoder_start_token_id is not None, ( + "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id." 
+ " See T5 docs for more information" + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full( + input_ids.shape[:-1] + (1,), decoder_start_token_id + ) + shifted_input_ids = torch.cat( + [shifted_input_ids, input_ids[..., :-1]], dim=-1 + ) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert ( + pad_token_id is not None + ), "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class T5Stack(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + + self.block = nn.ModuleList( + [ + T5Block(config, has_relative_attention_bias=bool(i == 0)) + for i in range(config.num_layers) + ] + ) + self.final_layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon + ) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.block)) + self.model_parallel = True + self.first_device = ( + "cpu" + if "cpu" in self.device_map.keys() + else "cuda:" + str(min(self.device_map.keys())) + ) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + # Load onto devices + for 
k, v in self.device_map.items(): + for layer in v: + cuda_device = "cuda:" + str(k) + self.block[layer] = self.block[layer].to(cuda_device) + + # Set embed_tokens to first layer + self.embed_tokens = self.embed_tokens.to(self.first_device) + # Set final layer norm to last device + self.final_layer_norm = self.final_layer_norm.to(self.last_device) + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + for i in range(len(self.block)): + self.block[i] = self.block[i].to("cpu") + self.embed_tokens = self.embed_tokens.to("cpu") + self.final_layer_norm = self.final_layer_norm.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and 
{err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds" + ) + + if inputs_embeds is None: + assert ( + self.embed_tokens is not None + ), "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = ( + past_key_values[0][0].shape[2] + seq_length + if past_key_values is not None + else seq_length + ) + + if use_cache is True: + assert ( + self.is_decoder + ), f"`use_cache` can only be set to `True` if {self} is used as a decoder" + + if attention_mask is None: + attention_mask = torch.ones( + batch_size, mask_seq_length, device=inputs_embeds.device + ) + if ( + self.is_decoder + and encoder_attention_mask is None + and encoder_hidden_states is not None + ): + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = torch.ones( + batch_size, + encoder_seq_length, + device=inputs_embeds.device, + dtype=torch.long, + ) + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device + ) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask( + cross_attn_head_mask, self.config.num_layers + ) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate( + zip(self.block, past_key_values) + ): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if position_bias is not None: + position_bias = position_bias.to(hidden_states.device) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to( + hidden_states.device + ) + if encoder_extended_attention_mask is 
not None: + encoder_extended_attention_mask = ( + encoder_extended_attention_mask.to(hidden_states.device) + ) + if encoder_decoder_position_bias is not None: + encoder_decoder_position_bias = encoder_decoder_position_bias.to( + hidden_states.device + ) + if layer_head_mask is not None: + layer_head_mask = layer_head_mask.to(hidden_states.device) + if cross_attn_layer_head_mask is not None: + cross_attn_layer_head_mask = cross_attn_layer_head_mask.to( + hidden_states.device + ) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return tuple(module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = checkpoint( + create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + 
layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[ + 4 if output_attentions else 3 + ] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + ( + present_key_value_state, + ) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +T5_START_DOCSTRING = r""" + + The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text + Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam 
Shazeer, Adam Roberts, Katherine Lee, Sharan + Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a + text-to-text denoising generative setting. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`T5Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +T5_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 + Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in + `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at + the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. 
+ + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +T5_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, +`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. +If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, +num_heads)`. 
+""" + + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.", + T5_START_DOCSTRING, +) +class T5Model(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r"encoder.embed_tokens.weight", + r"decoder.embed_tokens.weight", + ] + _keys_to_ignore_on_load_unexpected = [ + r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.decoder = self.decoder.to("cpu") + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = 
new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import T5Tokenizer, T5Model + + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5Model.from_pretrained("t5-small") + + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + ... 
).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + + >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model. + >>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg. + >>> decoder_input_ids = model._shift_right(decoder_input_ids) + + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = 
decoder_input_ids.to(self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device + ) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING +) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r"encoder.embed_tokens.weight", + r"decoder.embed_tokens.weight", + r"lm_head.weight", + ] + _keys_to_ignore_on_load_unexpected = [ + r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + 
encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.decoder = self.decoder.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=Seq2SeqLMOutput, 
config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + reduction: Optional[str] = "mean", + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., + config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for + labels in `[0, ..., config.vocab_size]` + + Returns: + + Examples: + + ```python + >>> from transformers import T5Tokenizer, T5ForConditionalGeneration + + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5ForConditionalGeneration.from_pretrained("t5-small") + + >>> # training + >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids + >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids + >>> outputs = model(input_ids=input_ids, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits + + >>> # inference + >>> input_ids = tokenizer( + ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" + ... 
).input_ids # Batch size 1 + >>> outputs = model.generate(input_ids) + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + + if ( + labels is not None + and decoder_input_ids is None + and decoder_inputs_embeds is None + ): + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) + if 
attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device + ) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100, reduction=reduction) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + if reduction == "none": + loss = loss.view(lm_logits.size(0), -1).sum(1) + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning( + "You might want to consider setting `use_cache=True` to speed up decoding" + ) + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device) + ), + ) + + assert reordered_layer_past_states[0].shape == layer_past_states[0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, + ) + return reordered_decoder_past + + +@add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw 
hidden-states without any specific head on top.", + T5_START_DOCSTRING, +) +class T5EncoderModel(T5PreTrainedModel): + authorized_missing_keys = [ + r"encoder.embed_tokens.weight", + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import T5Tokenizer, T5EncoderModel + + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5EncoderModel.from_pretrained("t5-small") + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + ... 
).input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return encoder_outputs diff --git a/lavis/models/blip_diffusion_models/__init__.py b/lavis/models/blip_diffusion_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lavis/models/blip_diffusion_models/__pycache__/__init__.cpython-310.pyc b/lavis/models/blip_diffusion_models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed51263a73107caaf74e5c2b08404e5851f4f35b Binary files /dev/null and b/lavis/models/blip_diffusion_models/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/blip_diffusion_models/__pycache__/blip_diffusion.cpython-310.pyc b/lavis/models/blip_diffusion_models/__pycache__/blip_diffusion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..143a9fe8944a9773d5205c063d4b2cba9c5ebe08 Binary files /dev/null and b/lavis/models/blip_diffusion_models/__pycache__/blip_diffusion.cpython-310.pyc differ diff --git a/lavis/models/blip_diffusion_models/__pycache__/modeling_ctx_clip.cpython-310.pyc b/lavis/models/blip_diffusion_models/__pycache__/modeling_ctx_clip.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ffe0926c2b3e4b7ce95de4d494b5c11cc60c2a9 Binary files /dev/null and b/lavis/models/blip_diffusion_models/__pycache__/modeling_ctx_clip.cpython-310.pyc differ diff --git a/lavis/models/blip_diffusion_models/__pycache__/ptp_utils.cpython-310.pyc 
b/lavis/models/blip_diffusion_models/__pycache__/ptp_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a7ff17fa71aae0e47e9416e477876cd2765da37 Binary files /dev/null and b/lavis/models/blip_diffusion_models/__pycache__/ptp_utils.cpython-310.pyc differ diff --git a/lavis/models/blip_diffusion_models/__pycache__/utils.cpython-310.pyc b/lavis/models/blip_diffusion_models/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50b9b6c6b196bd8c84ffd36db4a496d2d3fc6126 Binary files /dev/null and b/lavis/models/blip_diffusion_models/__pycache__/utils.cpython-310.pyc differ diff --git a/lavis/models/blip_diffusion_models/blip_diffusion.py b/lavis/models/blip_diffusion_models/blip_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..037d2c1044e03fab50908bc82cecd8be38b3afd6 --- /dev/null +++ b/lavis/models/blip_diffusion_models/blip_diffusion.py @@ -0,0 +1,997 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. 
class ProjLayer(nn.Module):
    """Residual MLP applied to the BLIP query features before the text encoder.

    Actual data flow (pre-norm):
        LayerNorm -> Dense1 -> QuickGELU -> Dense2 -> Dropout -> + input

    The residual branch carries the *un-normalised* input. Because the
    LayerNorm is sized with ``out_dim`` yet applied to the input, and the
    output is added back to the input, ``in_dim`` must equal ``out_dim``.

    Args:
        in_dim: feature size of the incoming tensor.
        out_dim: feature size of the output (and of the LayerNorm).
        hidden_dim: width of the intermediate dense layer.
        drop_p: dropout probability applied after the second dense layer.
        eps: epsilon passed to the LayerNorm.
    """

    def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12):
        super().__init__()

        # Attribute names are also state-dict keys -- keep them unchanged.
        self.dense1 = nn.Linear(in_dim, hidden_dim)
        self.act_fn = QuickGELU()
        self.dense2 = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(drop_p)

        self.LayerNorm = nn.LayerNorm(out_dim, eps=eps)

    def forward(self, x):
        """Return ``dropout(mlp(layer_norm(x))) + x``."""
        residual = x

        hidden = self.LayerNorm(x)
        hidden = self.dense1(hidden)
        hidden = self.act_fn(hidden)
        hidden = self.dense2(hidden)

        return self.dropout(hidden) + residual
__init__( + self, + vit_model="clip_L", + qformer_num_query_token=16, + qformer_cross_attention_freq=1, + qformer_pretrained_path=None, + qformer_train=False, + sd_pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5", + sd_train_text_encoder=False, + controlnet_pretrained_model_name_or_path=None, + vae_half_precision=False, + proj_train=False, + ): + super().__init__() + + self.num_query_token = qformer_num_query_token + + # BLIP-2 + self.blip = Blip2Qformer( + vit_model=vit_model, + num_query_token=qformer_num_query_token, + cross_attention_freq=qformer_cross_attention_freq, + ) + if qformer_pretrained_path is not None: + state_dict = torch.load(qformer_pretrained_path, map_location="cpu")[ + "model" + ] + # qformer keys: Qformer.bert.encoder.layer.1.attention.self.key.weight + # ckpt keys: text_model.bert.encoder.layer.1.attention.self.key.weight + for k in list(state_dict.keys()): + if "text_model" in k: + state_dict[k.replace("text_model", "Qformer")] = state_dict.pop(k) + + msg = self.blip.load_state_dict(state_dict, strict=False) + assert all(["visual" in k for k in msg.missing_keys]) + assert len(msg.unexpected_keys) == 0 + + self.qformer_train = qformer_train + + # projection layer + self.proj_layer = ProjLayer( + in_dim=768, out_dim=768, hidden_dim=3072, drop_p=0.1, eps=1e-12 + ) + self.proj_train = proj_train + + # stable diffusion + self.tokenizer = CLIPTokenizer.from_pretrained( + sd_pretrained_model_name_or_path, subfolder="tokenizer" + ) + self.text_encoder = CtxCLIPTextModel.from_pretrained( + sd_pretrained_model_name_or_path, subfolder="text_encoder" + ) + self.vae = AutoencoderKL.from_pretrained( + sd_pretrained_model_name_or_path, subfolder="vae" + ) + if vae_half_precision: + self.vae.half() + + self.unet = UNet2DConditionModel.from_pretrained( + sd_pretrained_model_name_or_path, subfolder="unet" + ) + # self.unet.enable_xformers_memory_efficient_attention() + + self.noise_scheduler = DDPMScheduler.from_config( + 
def freeze_modules(self):
    """Put the non-trainable sub-modules into a permanently frozen eval state.

    The VAE is always frozen. The text encoder, the BLIP model and the
    projection layer are frozen unless their corresponding ``*_train`` flag
    is set. Freezing means: switch to eval mode, replace ``train`` with
    ``self.disabled_train`` so later ``.train()`` calls are no-ops, and turn
    off gradients for every parameter.
    """
    # (flag attribute, module attribute): the module is frozen when the flag is falsy.
    optional_modules = (
        ("sd_train_text_encoder", "text_encoder"),
        ("qformer_train", "blip"),
        ("proj_train", "proj_layer"),
    )

    frozen = [self.vae]
    frozen += [
        getattr(self, module_attr)
        for flag_attr, module_attr in optional_modules
        if not getattr(self, flag_attr)
    ]

    for frozen_module in frozen:
        frozen_module.eval()
        # Shadow nn.Module.train on the instance so the mode can no longer change.
        frozen_module.train = self.disabled_train
        frozen_module.requires_grad_(False)
+ examples = [dataset[i] for i in range(dataset.len_without_repeat)] + input_images = ( + torch.stack([example["inp_image"] for example in examples]) + .to(memory_format=torch.contiguous_format) + .float() + ).to(self.device) + subject_text = [dataset.subject for _ in range(input_images.shape[0])] + + # calculate ctx embeddings and cache them + ctx_embeddings = self.forward_ctx_embeddings( + input_image=input_images, text_input=subject_text + ) + # take mean of all ctx embeddings + ctx_embeddings = ctx_embeddings.mean(dim=0, keepdim=True) + self.ctx_embeddings_cache = nn.Parameter(ctx_embeddings, requires_grad=True) + self._use_embeddings_cache = True + + # free up CUDA memory + self.blip.to("cpu") + self.proj_layer.to("cpu") + + torch.cuda.empty_cache() + + def forward(self, samples): + latents = self.vae.encode(samples["tgt_image"].half()).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, + self.noise_scheduler.config.num_train_timesteps, + (bsz,), + device=latents.device, + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) + ctx_embeddings = self.forward_ctx_embeddings( + input_image=samples["inp_image"], text_input=samples["subject_text"] + ) + + # Get the text embedding for conditioning + input_ids = self.tokenizer( + samples["caption"], + padding="do_not_pad", + truncation=True, + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids.to(self.device) + encoder_hidden_states = self.text_encoder( + input_ids=input_ids, + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=[self._CTX_BEGIN_POS] * input_ids.shape[0], + )[0] + + # Predict the noise residual + noise_pred = 
self.unet( + noisy_latents.float(), timesteps, encoder_hidden_states + ).sample + + loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean") + + return {"loss": loss} + + def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20): + rv = [] + for prompt, tgt_subject in zip(prompts, tgt_subjects): + prompt = f"a {tgt_subject} {prompt.strip()}" + # a trick to amplify the prompt + rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps))) + + return rv + + def _build_prompts_edit(self, cond_subject, tgt_subject, prompt): + placeholder = " ".join(["sks"] * self.num_query_token) + + src_prompt = f"a {cond_subject} {prompt}" + tgt_prompt = f"a {placeholder} {tgt_subject} {prompt}" + + return [src_prompt, tgt_prompt] + + def _predict_noise( + self, + t, + latent_model_input, + text_embeddings, + width=512, + height=512, + cond_image=None, + ): + if hasattr(self, "controlnet"): + cond_image = prepare_cond_image( + cond_image, width, height, batch_size=1, device=self.device + ) + + down_block_res_samples, mid_block_res_sample = self.controlnet( + latent_model_input, + t, + encoder_hidden_states=text_embeddings, + controlnet_cond=cond_image, + # conditioning_scale=controlnet_condition_scale, + return_dict=False, + ) + else: + down_block_res_samples, mid_block_res_sample = None, None + + noise_pred = self.unet( + latent_model_input, + timestep=t, + encoder_hidden_states=text_embeddings, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + )["sample"] + + return noise_pred + + def _init_latent(self, latent, height, width, generator, batch_size): + if latent is None: + latent = torch.randn( + (1, self.unet.in_channels, height // 8, width // 8), + generator=generator, + device=generator.device, + ) + latent = latent.expand( + batch_size, + self.unet.in_channels, + height // 8, + width // 8, + ) + return latent.to(self.device) + + def _forward_prompt_embeddings(self, 
@torch.no_grad()
def get_image_latents(self, image, sample=True, rng_generator=None):
    """Encode ``image`` with the VAE and return the scaled latent.

    Args:
        image: image tensor handed to ``self.vae.encode``.
        sample: when true, draw from the encoder's latent distribution;
            otherwise use the distribution's mode.
        rng_generator: generator forwarded to ``sample`` (ignored when
            ``sample`` is false).

    Returns:
        The encoding multiplied by 0.18215, the same factor that
        ``_latent_to_image`` divides out again.
    """
    assert isinstance(image, torch.Tensor)

    posterior = self.vae.encode(image).latent_dist
    if not sample:
        encoding = posterior.mode()
    else:
        encoding = posterior.sample(generator=rng_generator)

    return encoding * 0.18215
@torch.no_grad()
def generate(
    self,
    samples,
    latents=None,
    guidance_scale=7.5,
    height=512,
    width=512,
    seed=42,
    num_inference_steps=50,
    neg_prompt="",
    controller=None,
    prompt_strength=1.0,
    prompt_reps=20,
    use_ddim=False,
):
    """Generate an image conditioned on a reference subject and a text prompt.

    Args:
        samples: dict with the keys ``"cond_images"`` (reference image),
            ``"cond_subject"`` (source subject category), ``"tgt_subject"``
            (target subject category), ``"prompt"`` and, optionally,
            ``"cldm_cond_image"`` (conditioning image forwarded to the
            denoising step).
        latents: optional starting latent; sampled when ``None``.
        guidance_scale: classifier-free guidance weight; guidance is only
            applied when it is greater than 1.0.
        height: output height in pixels.
        width: output width in pixels.
        seed: RNG seed. ``None`` now selects a non-deterministic seed
            (previously ``generator`` was left unbound and the call failed).
        num_inference_steps: number of scheduler steps.
        neg_prompt: text used for the unconditional branch.
        controller: optional attention controller to install on the UNet.
        prompt_strength: multiplied with ``prompt_reps`` to get the number
            of prompt repetitions.
        prompt_reps: base number of prompt repetitions.
        use_ddim: use the DDIM scheduler instead of PNDM.

    Returns:
        Whatever ``self._latent_to_image`` returns for the final latents.
    """
    if controller is not None:
        # `_register_attention_refine` builds a controller from
        # (src_subject, prompts, num_inference_steps); installing an existing
        # controller is the job of `_register_attention_control`.
        self._register_attention_control(controller)

    cond_image = samples["cond_images"]  # reference image
    cond_subject = samples["cond_subject"]  # source subject category
    tgt_subject = samples["tgt_subject"]  # target subject category
    prompt = samples["prompt"]
    cldm_cond_image = samples.get("cldm_cond_image", None)  # conditional image

    prompt = self._build_prompt(
        prompts=prompt,
        tgt_subjects=tgt_subject,
        prompt_strength=prompt_strength,
        prompt_reps=prompt_reps,
    )

    text_embeddings = self._forward_prompt_embeddings(
        cond_image, cond_subject, prompt
    )

    # Unconditional embedding for classifier-free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0
    if do_classifier_free_guidance:
        max_length = self.text_encoder.text_model.config.max_position_embeddings

        uncond_input = self.tokenizer(
            [neg_prompt],
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )
        uncond_embeddings = self.text_encoder(
            input_ids=uncond_input.input_ids.to(self.device),
            ctx_embeddings=None,
        )[0]

        # Concatenate the unconditional and text embeddings into one batch so
        # a single forward pass serves both guidance branches.
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    # Always build a generator so the latent initialisation has a device to
    # sample on; seed it deterministically only when a seed was given.
    generator = torch.Generator(device=self.device)
    if seed is not None:
        generator.manual_seed(seed)
    else:
        generator.seed()

    latents = self._init_latent(latents, height, width, generator, batch_size=1)

    scheduler = self.pndm_scheduler if not use_ddim else self.ddim_scheduler
    scheduler.set_timesteps(num_inference_steps)

    # NOTE(review): `use_ddim` is forwarded as `use_inversion`, and
    # `_denoise_latent_step` indexes `noise_pred[2]` in that mode, while the
    # batch built here has at most two rows -- looks like `use_ddim=True`
    # cannot work as written; confirm before relying on it.
    for t in tqdm.tqdm(scheduler.timesteps):
        latents = self._denoise_latent_step(
            latents=latents,
            t=t,
            text_embeddings=text_embeddings,
            cond_image=cldm_cond_image,
            height=height,
            width=width,
            guidance_scale=guidance_scale,
            use_inversion=use_ddim,
        )

    return self._latent_to_image(latents)
def _register_attention_control(self, controller):
    """Install a ``P2PCrossAttnProcessor`` on every UNet attention layer.

    Each processor listed in ``self.unet.attn_processors`` whose name starts
    with ``mid_block``, ``up_blocks`` or ``down_blocks`` is replaced by a
    ``P2PCrossAttnProcessor`` tagged with its location (``"mid"``, ``"up"``
    or ``"down"``). Names with any other prefix are skipped.

    Args:
        controller: attention controller shared by all processors. When it
            is not ``None`` its ``num_att_layers`` attribute is set to the
            number of processors installed.
    """
    # Checked in the same order as the original if/elif chain.
    prefix_to_place = (
        ("mid_block", "mid"),
        ("up_blocks", "up"),
        ("down_blocks", "down"),
    )

    attn_procs = {}
    for name in self.unet.attn_processors.keys():
        place_in_unet = next(
            (place for prefix, place in prefix_to_place if name.startswith(prefix)),
            None,
        )
        if place_in_unet is None:
            continue

        attn_procs[name] = P2PCrossAttnProcessor(
            controller=controller, place_in_unet=place_in_unet
        )

    self.unet.set_attn_processor(attn_procs)
    if controller is not None:
        # Every processor registered above counts, not only cross-attention ones.
        controller.num_att_layers = len(attn_procs)
prompts=prompt, + num_inference_steps=num_inference_steps, + cross_replace_steps=cross_replace_steps, + self_replace_steps=self_replace_steps, + threshold=lb_threshold, + ) + + query_embeds = self.forward_ctx_embeddings(cond_image, cond_subject) + + tokenized_prompt_bef = self._tokenize_text(prompt[:1], with_query=False).to( + self.device + ) + tokenized_prompt_aft = self._tokenize_text(prompt[1:], with_query=True).to( + self.device + ) + + text_embeddings_bef = self.text_encoder( + input_ids=tokenized_prompt_bef.input_ids, + )[0] + text_embeddings_aft = self.text_encoder( + input_ids=tokenized_prompt_aft.input_ids, + ctx_embeddings=query_embeds, + ctx_begin_pos=[self._CTX_BEGIN_POS], + )[0] + + text_embeddings = torch.cat([text_embeddings_bef, text_embeddings_aft], dim=0) + + # 3. unconditional embedding + do_classifier_free_guidance = guidance_scale > 1.0 + + # [TODO] add support for batched input + batch_size = 2 + + if do_classifier_free_guidance: + max_length = self.text_encoder.text_model.config.max_position_embeddings + + uncond_input = self.tokenizer( + [neg_prompt], + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + + # FIXME use context embedding for uncond_input or not? + uncond_embeddings = self.text_encoder( + input_ids=uncond_input.input_ids.to(self.device), + ctx_embeddings=None, + )[0] + # repeat the uncond embedding to match the number of prompts + uncond_embeddings = uncond_embeddings.expand(batch_size, -1, -1) + + # For classifier free guidance, we need to do two forward passes. 
def _latent_to_image(self, latents):
    """Decode latents with the VAE and convert the result via ``numpy_to_pil``.

    The latents are first divided by the 0.18215 factor applied at encode
    time. The decoded tensor is mapped with ``x / 2 + 0.5``, clamped to
    [0, 1], moved to the CPU and permuted from axis order (0, 1, 2, 3) to
    (0, 2, 3, 1) before being handed to ``numpy_to_pil``.
    """
    unscaled = 1 / 0.18215 * latents
    decoded = self.vae.decode(unscaled).sample

    pixels = (decoded / 2 + 0.5).clamp(0, 1)
    pixels = pixels.cpu().permute(0, 2, 3, 1).numpy()

    return numpy_to_pil(pixels)
scheduler.config.num_train_timesteps // scheduler.num_inference_steps + ) + alpha_prod_t = scheduler.alphas_cumprod[t] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] + if prev_timestep >= 0 + else scheduler.final_alpha_cumprod + ) + alpha_prod_t, alpha_prod_t_prev = alpha_prod_t_prev, alpha_prod_t + latents = backward_ddim( + x_t=latents, + alpha_t=alpha_prod_t, + alpha_tm1=alpha_prod_t_prev, + eps_xt=noise_pred, + ) + + return latents + + def _denoise_latent_step( + self, + latents, + t, + text_embeddings, + guidance_scale, + height, + width, + cond_image=None, + use_inversion=False, + ): + if use_inversion: + noise_placeholder = [] + + # expand the latents if we are doing classifier free guidance + do_classifier_free_guidance = guidance_scale > 1.0 + + latent_model_input = ( + torch.cat([latents] * 2) if do_classifier_free_guidance else latents + ) + + # predict the noise residual + noise_pred = self._predict_noise( + t=t, + latent_model_input=latent_model_input, + text_embeddings=text_embeddings, + width=width, + height=height, + cond_image=cond_image, + ) + + if use_inversion: + noise_placeholder.append(noise_pred[2].unsqueeze(0)) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + + if use_inversion: + noise_placeholder.append(noise_pred[-1].unsqueeze(0)) + noise_pred = torch.cat(noise_placeholder) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler = self.ddim_scheduler if use_inversion else self.pndm_scheduler + + latents = scheduler.step( + noise_pred, + t, + latents, + )["prev_sample"] + + return latents + + def _tokenize_text(self, text_input, with_query=True): + max_len = self.text_encoder.text_model.config.max_position_embeddings + if with_query: + max_len -= self.num_query_token + + tokenized_text = self.tokenizer( + text_input, + padding="max_length", + 
def forward_ctx_embeddings(self, input_image, text_input, ratio=None):
    """Compute the subject context embeddings that are injected into the text encoder.

    Args:
        input_image: a single image batch, or a sequence of image batches
            (one per entry of ``text_input``) whose embeddings are blended.
        text_input: a string, a list of strings (one group), or a list of
            lists of strings (several groups, paired with ``input_image``).
        ratio: optional blending weights, one per group, that must sum to 1
            (within floating-point tolerance). Defaults to a uniform average.

    Returns:
        When the embedding cache is active, ``self.ctx_embeddings_cache``
        expanded to ``len(text_input)`` rows. Otherwise the weighted sum of
        the per-group embeddings produced by BLIP followed by
        ``self.proj_layer``.

    Raises:
        AssertionError: if ``ratio`` has the wrong length or does not sum to 1.
    """
    import math

    def compute_ctx_embeddings(image, text):
        blip_embeddings = self.blip.extract_features(
            {"image": image, "text_input": text}, mode="multimodal"
        ).multimodal_embeds
        return self.proj_layer(blip_embeddings)

    if isinstance(text_input, str):
        text_input = [text_input]

    if self._use_embeddings_cache:
        # Expand the cached embedding to the batch size.
        return self.ctx_embeddings_cache.expand(len(text_input), -1, -1)

    if isinstance(text_input[0], str):
        # A flat list of strings is a single group.
        text_input, input_image = [text_input], [input_image]

    all_ctx_embeddings = [
        compute_ctx_embeddings(inp_image, inp_text)
        for inp_image, inp_text in zip(input_image, text_input)
    ]

    if ratio is not None:
        assert len(ratio) == len(all_ctx_embeddings), (
            "ratio must have one weight per embedding group"
        )
        # Exact `== 1` rejects valid weights such as [0.1] * 10, whose float
        # sum is 0.9999999999999999.
        assert math.isclose(sum(ratio), 1.0, rel_tol=0.0, abs_tol=1e-6), (
            "ratio must sum to 1"
        )
    else:
        ratio = [1 / len(all_ctx_embeddings)] * len(all_ctx_embeddings)

    ctx_embeddings = torch.zeros_like(all_ctx_embeddings[0])
    for weight, group_embeddings in zip(ratio, all_ctx_embeddings):
        ctx_embeddings += weight * group_embeddings

    return ctx_embeddings
def load_checkpoint_from_dir(self, checkpoint_dir_or_url):
    """Load the per-component weight files found in a checkpoint directory.

    Args:
        checkpoint_dir_or_url: local directory, or a URL that is downloaded
            and untarred first.

    Each component file is optional: a missing file is logged and skipped.
    If the cached context embedding file is present it is loaded and the
    embedding cache is switched on; otherwise the cache is switched off.
    """
    # If a URL was given, download and untar it to obtain a local directory.
    if is_url(checkpoint_dir_or_url):
        checkpoint_dir_or_url = download_and_untar(checkpoint_dir_or_url)

    logging.info(f"Loading pretrained model from {checkpoint_dir_or_url}")

    def load_state_dict(module, filename):
        # Only the file read may legitimately fail with FileNotFoundError;
        # keep the `try` narrow so errors raised while applying the weights
        # are not mistaken for a missing file.
        # SECURITY NOTE: torch.load unpickles the file. The checkpoint may
        # have just been downloaded, so only point this at trusted sources.
        try:
            state_dict = torch.load(
                os.path.join(checkpoint_dir_or_url, filename), map_location="cpu"
            )
        except FileNotFoundError:
            logging.info("File not found, skip loading: {}".format(filename))
            return

        # strict=False: missing and unexpected keys are tolerated.
        module.load_state_dict(state_dict, strict=False)

    component_files = (
        (self.proj_layer, "proj_layer/proj_weight.pt"),
        (self.blip, "blip_model/blip_weight.pt"),
        (self.unet, "unet/diffusion_pytorch_model.bin"),
        (self.vae, "vae/diffusion_pytorch_model.bin"),
        (self.text_encoder, "text_encoder/pytorch_model.bin"),
    )
    for module, filename in component_files:
        load_state_dict(module, filename)

    try:
        self.ctx_embeddings_cache.data = torch.load(
            os.path.join(
                checkpoint_dir_or_url, "ctx_embeddings_cache/ctx_embeddings_cache.pt"
            ),
            map_location=self.device,
        )
    except FileNotFoundError:
        self._use_embeddings_cache = False
        print("No ctx_embeddings_cache found in {}".format(checkpoint_dir_or_url))
    else:
        self._use_embeddings_cache = True
        print("Loaded ctx_embeddings_cache from {}".format(checkpoint_dir_or_url))
self.load_checkpoint_from_dir(checkpoint_dir) + + def load_checkpoint(self, url_or_filename): + """ + Used to load finetuned models. + """ + + super().load_checkpoint(url_or_filename) + + print("loading fine-tuned model from {}".format(url_or_filename)) + self._use_embeddings_cache = True diff --git a/lavis/models/blip_diffusion_models/modeling_ctx_clip.py b/lavis/models/blip_diffusion_models/modeling_ctx_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..737b77d3ff431e9f98db898b8fed308c2dd41d09 --- /dev/null +++ b/lavis/models/blip_diffusion_models/modeling_ctx_clip.py @@ -0,0 +1,240 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from transformers.modeling_outputs import BaseModelOutputWithPooling +from transformers.models.clip.configuration_clip import CLIPTextConfig +from transformers.models.clip.modeling_clip import ( + CLIPEncoder, + CLIPPreTrainedModel, + _expand_mask, +) + + +class CtxCLIPTextModel(CLIPPreTrainedModel): + config_class = CLIPTextConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = CtxCLIPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + def forward( + self, + ctx_embeddings: torch.Tensor = None, + ctx_begin_pos: list = None, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + 
output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import CLIPTokenizer, CLIPTextModel + + >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + return self.text_model( + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=ctx_begin_pos, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CtxCLIPTextTransformer(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CtxCLIPTextEmbeddings(config) + self.encoder = CLIPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + ctx_embeddings: torch.Tensor, + ctx_begin_pos: list, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None 
else self.config.use_return_dict + ) + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=ctx_begin_pos, + ) + + bsz, seq_len = input_shape + if ctx_embeddings is not None: + seq_len += ctx_embeddings.size(1) + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask( + bsz, seq_len, hidden_states.dtype + ).to(hidden_states.device) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=input_ids.device), + input_ids.to(torch.int).argmax(dim=-1), + ] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + 
class CtxCLIPTextEmbeddings(nn.Module):
    """Token + position embeddings with optional context embeddings spliced in.

    For every sample, ``ctx_embeddings[i]`` is inserted into the token
    embedding sequence right before index ``ctx_begin_pos[i]``; learned
    position embeddings are then added to the (lengthened) sequence.
    """

    def __init__(self, config: "CLIPTextConfig"):
        super().__init__()
        width = config.hidden_size
        max_positions = config.max_position_embeddings

        self.token_embedding = nn.Embedding(config.vocab_size, width)
        self.position_embedding = nn.Embedding(max_positions, width)

        # Kept as a (1, max_positions) buffer so it is saved with the module.
        self.register_buffer(
            "position_ids", torch.arange(max_positions).expand((1, -1))
        )

    def forward(
        self,
        ctx_embeddings: torch.Tensor,
        ctx_begin_pos: list,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """Embed ``input_ids`` (or use ``inputs_embeds``) and splice in the context.

        Args:
            ctx_embeddings: Per-sample context embeddings, indexed as
                ``ctx_embeddings[i]`` with length ``ctx_embeddings.shape[1]``,
                or ``None`` to skip the splice.
            ctx_begin_pos: Per-sample insertion index; only read when
                ``ctx_embeddings`` is given.
            input_ids: Token ids; used when ``inputs_embeds`` is ``None``.
            position_ids: Explicit position ids; defaults to the first
                ``seq_length`` entries of the registered buffer.
            inputs_embeds: Pre-computed token embeddings.

        Returns:
            Sum of the (possibly extended) token embeddings and the position
            embeddings.
        """
        has_ctx = ctx_embeddings is not None
        ctx_len = ctx_embeddings.shape[1] if has_ctx else 0

        if input_ids is not None:
            text_len = input_ids.shape[-1]
        else:
            text_len = inputs_embeds.shape[-2]
        seq_length = text_len + ctx_len

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        if has_ctx:
            # Per sample: tokens before the insertion point, the context
            # embeddings, then every remaining token.
            spliced = [
                torch.cat(
                    [
                        inputs_embeds[row, : ctx_begin_pos[row]],
                        ctx_embeddings[row],
                        inputs_embeds[row, ctx_begin_pos[row]:],
                    ],
                    dim=0,
                )
                for row in range(inputs_embeds.shape[0])
            ]
            inputs_embeds = torch.stack(spliced, dim=0)

        return inputs_embeds + self.position_embedding(position_ids)
def view_images(images: Union[np.ndarray, List],
                num_rows: int = 1,
                offset_ratio: float = 0.02,
                display_image: bool = True) -> Image.Image:
    """Tile images into a grid with ``num_rows`` rows and return it as a PIL image.

    Args:
        images: A list of ``(h, w, c)`` arrays, a 4-D array iterated along its
            first axis, or a single ``(h, w, c)`` array.
        num_rows: Number of rows in the grid.
        offset_ratio: Gap between tiles, as a fraction of the tile height.
        display_image: When true, also show the grid via IPython ``display``.

    Returns:
        The assembled grid on a white background.
    """
    if isinstance(images, list):
        num_images = len(images)
    elif images.ndim == 4:
        num_images = images.shape[0]
    else:
        images = [images]
        num_images = 1

    # Pad with blank tiles up to the next multiple of num_rows. The former
    # `num_images % num_rows` only coincided with this in some cases and
    # otherwise dropped trailing images (e.g. 4 images in 3 rows).
    num_empty = -num_images % num_rows

    empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
    images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
    num_items = len(images)

    h, w, _ = images[0].shape
    offset = int(h * offset_ratio)
    num_cols = num_items // num_rows
    # The canvas is always allocated with 3 channels.
    image_ = np.ones((h * num_rows + offset * (num_rows - 1),
                      w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
    for i in range(num_rows):
        top = i * (h + offset)
        for j in range(num_cols):
            left = j * (w + offset)
            image_[top: top + h, left: left + w] = images[i * num_cols + j]

    pil_img = Image.fromarray(image_)
    if display_image:
        display(pil_img)
    return pil_img
class LocalBlend:
    """Blend edited latents with the first (base) latent using word attention masks.

    The mask is built from stored cross-attention maps, weighted by
    ``alpha_layers``, which marks the token positions of the selected words
    for every prompt.
    """

    def __call__(self, x_t, attention_store):
        """Return ``x_t`` with edits kept only where the word mask is active.

        Args:
            x_t: Latent batch; index 0 is used as the reference everything
                else is blended against.
            attention_store: Mapping with ``"down_cross"`` and ``"up_cross"``
                lists of attention maps.
        """
        k = 1
        maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
        # Every selected map is viewed as (prompts, -1, 1, 16, 16, words).
        maps = [item.reshape(self.alpha_layers.shape[0], -1, 1, 16, 16, self.max_num_words) for item in maps]
        maps = torch.cat(maps, dim=1)
        # Keep only the selected words, then average over the stacked maps.
        maps = (maps * self.alpha_layers).sum(-1).mean(1)
        # Dilate with a (2k+1)x(2k+1) max-pool, then resize to the latent size.
        mask = nnf.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k))
        mask = nnf.interpolate(mask, size=(x_t.shape[2:]))
        # Normalise each map by its own maximum before thresholding.
        mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
        mask = mask.gt(self.threshold)
        # Combine the first prompt's mask with each remaining prompt's mask.
        mask = (mask[:1] + mask[1:]).float()
        x_t = x_t[:1] + mask * (x_t - x_t[:1])
        return x_t

    def __init__(self, prompts: List[str], words, tokenizer, device, threshold=.3, max_num_words=77):
        """Build the per-prompt word selector.

        Args:
            prompts: Prompts, one per batch entry.
            words: For each prompt, a word or a list of words to blend on.
            tokenizer: Tokenizer forwarded to ``get_word_inds``.
            device: Device the selector tensor is moved to.
            threshold: Cut-off applied to the normalised mask.
            max_num_words: Size of the token axis. Previously this argument
                was accepted but ignored in favour of a hard-coded 77.
        """
        self.max_num_words = max_num_words

        alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, self.max_num_words)
        for i, (prompt, words_) in enumerate(zip(prompts, words)):
            if isinstance(words_, str):
                words_ = [words_]
            for word in words_:
                ind = get_word_inds(prompt, word, tokenizer)
                alpha_layers[i, :, :, :, :, ind] = 1
        self.alpha_layers = alpha_layers.to(device)
        self.threshold = threshold
class AttentionRefine(AttentionControlEdit):
    """Cross-attention edit that reuses base attention for tokens shared by both prompts."""

    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
                 local_blend: Optional[LocalBlend] = None, tokenizer=None, device=None):
        super().__init__(prompts, num_steps, cross_replace_steps, self_replace_steps,
                         local_blend, tokenizer, device)
        token_map, keep_base = get_refinement_mapper(prompts, self.tokenizer)
        keep_base = keep_base.to(self.device)
        self.mapper = token_map.to(self.device)
        # Stored as (n_edits, 1, 1, n_tokens) so it broadcasts over the middle axes.
        self.alphas = keep_base.reshape(keep_base.shape[0], 1, 1, keep_base.shape[1])

    def replace_cross_attention(self, attn_base, att_replace):
        # Illustration with 16 subject embeddings inserted at position 2:
        #   mapper -> [0, 1, -1 (x16), 2, 3, 4, ...]
        #       tokens before the insertion map to themselves, the inserted
        #       slots are -1, and tokens after it continue from 2.
        #   alphas -> [1, 1, 0 (x16), 1, 1, 1, ...]
        #       1 keeps the old (base) attention, 0 uses the new attention.
        gathered = attn_base[:, :, self.mapper]
        base_on_new_tokens = gathered.permute(2, 0, 1, 3)
        keep = self.alphas
        return base_on_new_tokens * keep + att_replace * (1 - keep)
def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int,
                           word_inds: Optional[torch.Tensor]=None):
    """Switch selected words on for a window of steps, in place.

    Args:
        alpha: Tensor indexed as ``[step, prompt, word]``; modified in place.
        bounds: ``(start, end)`` fractions of the step axis, or one number
            meaning ``(0, number)``. Integer scalars such as ``1`` are now
            accepted; previously only ``float`` scalars were, and an ``int``
            failed with a ``TypeError`` when it was subscripted.
        prompt_ind: Index along the prompt axis to update.
        word_inds: Word indices to update; defaults to every word.

    Returns:
        The same ``alpha`` tensor: 1 inside the window and 0 outside it, for
        the selected prompt and words.
    """
    if isinstance(bounds, (int, float)):
        bounds = 0, bounds
    num_steps = alpha.shape[0]
    start, end = int(bounds[0] * num_steps), int(bounds[1] * num_steps)
    if word_inds is None:
        word_inds = torch.arange(alpha.shape[2])
    alpha[: start, prompt_ind, word_inds] = 0
    alpha[start: end, prompt_ind, word_inds] = 1
    alpha[end:, prompt_ind, word_inds] = 0
    return alpha
def get_matrix(size_x, size_y, gap):
    """Return the initial score matrix for a global alignment.

    This is the only definition kept: an earlier list-based ``get_matrix`` was
    shadowed by this one and called ``len()`` on the integer sizes, so it was
    both dead and broken.

    Args:
        size_x: Length of the first sequence (rows - 1).
        size_y: Length of the second sequence (columns - 1).
        gap: Gap score; the first row and column hold multiples of it.

    Returns:
        An ``int32`` array of shape ``(size_x + 1, size_y + 1)`` that is zero
        everywhere except the first row and column, where entry ``k`` is
        ``k * gap``.
    """
    matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
    matrix[0, 1:] = (np.arange(size_y) + 1) * gap
    matrix[1:, 0] = (np.arange(size_x) + 1) * gap
    return matrix
def get_aligned_sequences(x, y, trace_back):
    """Walk a traceback matrix from the bottom-right corner and collect the alignment.

    Trace codes: 3 consumes one item from both sequences, 1 consumes only from
    ``y`` (gap in ``x``), 2 consumes only from ``x`` (gap in ``y``), and 4
    stops the walk.

    Args:
        x: First sequence.
        y: Second sequence.
        trace_back: 2-D array of trace codes, indexed ``[i, j]`` with
            ``i <= len(x)`` and ``j <= len(y)``.

    Returns:
        ``(x_seq, y_seq, mapper)``. ``x_seq`` and ``y_seq`` are the aligned
        items in walk order (end to start, ``'-'`` for a gap). ``mapper`` is
        an ``int64`` tensor of ``(y_index, x_index)`` rows in ascending order,
        with ``x_index == -1`` where ``y`` has no counterpart; it has shape
        ``(0, 2)`` when no ``y`` item was consumed.

    Raises:
        ValueError: If an unknown trace code is met. Previously this case
            made the loop spin forever.
    """
    x_seq = []
    y_seq = []
    i = len(x)
    j = len(y)
    mapper_y_to_x = []
    while i > 0 or j > 0:
        step = trace_back[i, j]
        if step == 3:
            x_seq.append(x[i - 1])
            y_seq.append(y[j - 1])
            i -= 1
            j -= 1
            mapper_y_to_x.append((j, i))
        elif step == 1:
            x_seq.append('-')
            y_seq.append(y[j - 1])
            j -= 1
            mapper_y_to_x.append((j, -1))
        elif step == 2:
            x_seq.append(x[i - 1])
            y_seq.append('-')
            i -= 1
        elif step == 4:
            break
        else:
            raise ValueError(f"unknown traceback code {step!r} at position ({i}, {j})")
    mapper_y_to_x.reverse()
    if not mapper_y_to_x:
        # Keep the two-column layout so callers can still slice `[:, 1]`.
        return x_seq, y_seq, torch.zeros((0, 2), dtype=torch.int64)
    return x_seq, y_seq, torch.tensor(mapper_y_to_x, dtype=torch.int64)
class P2PCrossAttnProcessor:
    """Attention processor that hands the attention probabilities to a controller."""

    def __init__(self, controller, place_in_unet):
        super().__init__()
        self.controller = controller
        self.place_in_unet = place_in_unet

    def __call__(self, attn: "CrossAttention", hidden_states, encoder_hidden_states=None, attention_mask=None):
        """Run attention via ``attn``'s projections, notifying the controller.

        When ``encoder_hidden_states`` is ``None`` the layer attends to
        ``hidden_states`` itself and the controller is told ``is_cross=False``.
        """
        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(
            attention_mask, sequence_length, batch_size=batch_size
        )

        query = attn.to_q(hidden_states)

        is_cross = encoder_hidden_states is not None
        context = encoder_hidden_states if is_cross else hidden_states
        key = attn.to_k(context)
        value = attn.to_v(context)

        query, key, value = (
            attn.head_to_batch_dim(query),
            attn.head_to_batch_dim(key),
            attn.head_to_batch_dim(value),
        )

        attention_probs = attn.get_attention_scores(query, key, attention_mask)

        # The controller's return value is not used; only changes it makes to
        # `attention_probs` in place affect the output.
        if self.controller is not None:
            self.controller(attention_probs, is_cross, self.place_in_unet)

        out = attn.batch_to_head_dim(torch.bmm(attention_probs, value))

        # to_out[0] is the linear projection, to_out[1] the dropout.
        project, dropout = attn.to_out[0], attn.to_out[1]
        return dropout(project(out))
def generate_canny(cond_image_input, low_threshold, high_threshold, image_resolution=512):
    """Return the Canny edge map of a conditioning image.

    Args:
        cond_image_input: Image-like object convertible with ``np.array``.
        low_threshold: Lower Canny threshold, forwarded to ``preprocess_canny``.
        high_threshold: Upper Canny threshold, forwarded to ``preprocess_canny``.
        image_resolution: Resolution passed to ``preprocess_canny``. Defaults
            to 512, the value that used to be hard-coded here.

    Returns:
        Whatever ``preprocess_canny`` returns for the converted image.
    """
    # The downstream helper is annotated as taking an ndarray, so convert
    # first and force uint8.
    cond_image_input = np.array(cond_image_input).astype(np.uint8)

    vis_control_image = preprocess_canny(
        cond_image_input,
        image_resolution,
        low_threshold=low_threshold,
        high_threshold=high_threshold,
    )

    return vis_control_image
def tie_encoder_decoder_weights(
    encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key: str
):
    """Make ``encoder`` share ``weight``/``bias`` parameters with ``decoder``.

    Both module trees are walked in parallel, following the decoder's
    children. Wherever the decoder module has a ``weight`` attribute and
    ``skip_key`` is not a substring of the module path, the encoder module's
    ``weight`` (and ``bias``, when the decoder has one) is rebound to the
    decoder's object.

    Args:
        encoder: Module whose parameters are replaced.
        decoder: Module whose parameters are reused.
        base_model_prefix: Root label for the ``/``-separated module paths.
        skip_key: Paths containing this substring are not tied at that level;
            the walk descends into their children instead.

    Raises:
        ValueError: If the recursion depth exceeds 500.
    """
    # Collected during the walk, but neither returned nor logged in this function.
    uninitialized_encoder_weights: List[str] = []
    if decoder.__class__ != encoder.__class__:
        logging.info(
            f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
        )

    def tie_encoder_to_decoder_recursively(
        decoder_pointer: nn.Module,
        encoder_pointer: nn.Module,
        module_name: str,
        uninitialized_encoder_weights: List[str],
        skip_key: str,
        depth=0,
    ):
        """Tie one decoder/encoder module pair, recursing into children if needed."""
        assert isinstance(decoder_pointer, nn.Module) and isinstance(
            encoder_pointer, nn.Module
        ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
        # A module with a `weight` is tied here and the walk stops at it.
        if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
            assert hasattr(encoder_pointer, "weight")
            encoder_pointer.weight = decoder_pointer.weight
            if hasattr(decoder_pointer, "bias"):
                assert hasattr(encoder_pointer, "bias")
                encoder_pointer.bias = decoder_pointer.bias
            print(module_name + " is tied")
            return

        encoder_modules = encoder_pointer._modules
        decoder_modules = decoder_pointer._modules
        if len(decoder_modules) > 0:
            assert (
                len(encoder_modules) > 0
            ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"

            # Start from every encoder child; each one visited is removed, so
            # what is left was never matched to a decoder child.
            all_encoder_weights = set(
                [module_name + "/" + sub_name for sub_name in encoder_modules.keys()]
            )
            # Running offset between decoder and encoder indices for
            # numerically named children.
            encoder_layer_pos = 0
            for name, module in decoder_modules.items():
                if name.isdigit():
                    encoder_name = str(int(name) + encoder_layer_pos)
                    decoder_name = name
                    if not isinstance(
                        decoder_modules[decoder_name],
                        type(encoder_modules[encoder_name]),
                    ) and len(encoder_modules) != len(decoder_modules):
                        # this can happen if the name corresponds to the position in a list module list of layers
                        # in this case the decoder has added a cross-attention that the encoder does not have
                        # thus skip this step and subtract one layer pos from encoder
                        encoder_layer_pos -= 1
                        continue
                elif name not in encoder_modules:
                    # Decoder-only child: nothing to tie it to.
                    continue
                elif depth > 500:
                    raise ValueError(
                        "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
                    )
                else:
                    decoder_name = encoder_name = name
                tie_encoder_to_decoder_recursively(
                    decoder_modules[decoder_name],
                    encoder_modules[encoder_name],
                    module_name + "/" + name,
                    uninitialized_encoder_weights,
                    skip_key,
                    depth=depth + 1,
                )
                all_encoder_weights.remove(module_name + "/" + encoder_name)

            uninitialized_encoder_weights += list(all_encoder_weights)

    # tie weights recursively
    tie_encoder_to_decoder_recursively(
        decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key
    )
a/lavis/models/blip_models/__pycache__/blip_feature_extractor.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_feature_extractor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e76057887d5b6cbd356f9ea49154c8c1f29966f Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_feature_extractor.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/blip_image_text_matching.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_image_text_matching.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9f4fb2dd72dce6eaae66e4ab8eeb52891932370 Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_image_text_matching.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/blip_nlvr.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_nlvr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e62430b897e36d52678a9871cf2ac3425d2f34f4 Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_nlvr.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/blip_outputs.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_outputs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54c6e96b6a8f9b0a8a78f69d1fa4328678e526b2 Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_outputs.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/blip_pretrain.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_pretrain.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ef452790fae59fd37fe2d00713a0a9f3926e16d Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_pretrain.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/blip_retrieval.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_retrieval.cpython-310.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..2f097d3871e2002e1a9e21a80c3bf9667615f573 Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_retrieval.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/blip_vqa.cpython-310.pyc b/lavis/models/blip_models/__pycache__/blip_vqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..948347dbc21144bbbff1e9dd79459314923de060 Binary files /dev/null and b/lavis/models/blip_models/__pycache__/blip_vqa.cpython-310.pyc differ diff --git a/lavis/models/blip_models/__pycache__/nlvr_encoder.cpython-310.pyc b/lavis/models/blip_models/__pycache__/nlvr_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ed4537da695a5816b9ee03fdc670fbf6a376eaa Binary files /dev/null and b/lavis/models/blip_models/__pycache__/nlvr_encoder.cpython-310.pyc differ diff --git a/lavis/models/blip_models/blip.py b/lavis/models/blip_models/blip.py new file mode 100644 index 0000000000000000000000000000000000000000..fba1f4893c7bebdb3f6fc5ba87fe055274397d91 --- /dev/null +++ b/lavis/models/blip_models/blip.py @@ -0,0 +1,65 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class BlipBase(BaseModel):
    """Shared base class for BLIP models.

    Provides the transformers version guard, the BERT-based tokenizer setup
    and the checkpoint loading logic reused by the BLIP model variants.
    """

    def __init__(self):
        super().__init__()
        # Refuse to run on a transformers release at or above 4.27.
        transformers_version = version.parse(transformers.__version__)
        assert transformers_version < version.parse("4.27"), "BLIP models are not compatible with transformers>=4.27, run pip install transformers==4.25 to downgrade"

    @classmethod
    def init_tokenizer(cls):
        """Build the ``bert-base-uncased`` tokenizer with BLIP's extra tokens.

        ``[DEC]`` is registered as the BOS token and ``[ENC]`` as an additional
        special token; the id of ``[ENC]`` is exposed as ``enc_token_id``.

        Returns:
            The configured ``BertTokenizer``.
        """
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        tokenizer.add_special_tokens({"bos_token": "[DEC]"})
        tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]})
        tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
        return tokenizer

    def load_from_pretrained(self, url_or_filename):
        """Load weights from a checkpoint URL or local file into this model.

        Args:
            url_or_filename (str): URL (downloaded through the cache helper) or
                path of a checkpoint whose ``"model"`` entry is a state dict.

        Returns:
            The result of ``load_state_dict(..., strict=False)``, which lists
            missing and unexpected keys.

        Raises:
            RuntimeError: If the argument is neither a URL nor an existing file.
        """
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        state_dict = checkpoint["model"]

        # Snapshot the model's own state once; state_dict() rebuilds the whole
        # mapping on every call, so calling it per key is needlessly quadratic.
        own_state = self.state_dict()

        state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )
        # Only touch the momentum encoder when both the model and the
        # checkpoint carry it; a checkpoint without momentum weights must not
        # fail here (the key is then simply reported as missing).
        momentum_key = "visual_encoder_m.pos_embed"
        if momentum_key in own_state and momentum_key in state_dict:
            state_dict[momentum_key] = interpolate_pos_embed(
                state_dict[momentum_key], self.visual_encoder_m
            )

        # Drop checkpoint entries whose shape does not match the model so that
        # load_state_dict does not raise on them.
        for key, own_tensor in own_state.items():
            if key in state_dict and state_dict[key].shape != own_tensor.shape:
                del state_dict[key]

        msg = self.load_state_dict(state_dict, strict=False)

        logging.info("Missing keys {}".format(msg.missing_keys))
        logging.info("load checkpoint from %s" % url_or_filename)

        return msg
+ + Usage: + >>> from lavis.models import load_model + >>> model = load_model("blip_caption", "base_coco") + >>> model = load_model("blip_caption", "large_coco") + """ + + PRETRAINED_MODEL_CONFIG_DICT = { + "base_coco": "configs/models/blip_caption_base_coco.yaml", + "large_coco": "configs/models/blip_caption_large_coco.yaml", + } + + def __init__(self, image_encoder, text_decoder, prompt=None, max_txt_len=40): + super().__init__() + + self.tokenizer = self.init_tokenizer() + + self.visual_encoder = image_encoder + self.text_decoder = text_decoder + + self.prompt = prompt + self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1 + + self.max_txt_len = max_txt_len + + def forward_encoder(self, samples): + image_embeds = self.visual_encoder.forward_features(samples["image"]) + return image_embeds + + def forward_decoder(self, samples, image_embeds): + # prepare inputs for forwarding decoder + raw_text = samples["text_input"] + text = self.tokenizer( + raw_text, + padding="longest", + truncation=True, + max_length=self.max_txt_len, + return_tensors="pt", + ).to(self.device) + text.input_ids[:, 0] = self.tokenizer.bos_token_id + + # prepare targets for forwarding decoder + decoder_targets = text.input_ids.masked_fill( + text.input_ids == self.tokenizer.pad_token_id, -100 + ) + decoder_targets[:, : self.prompt_length] = -100 + + # forward decoder + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( + self.device + ) + decoder_output = self.text_decoder( + input_ids=text.input_ids, + attention_mask=text.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + labels=decoder_targets, + return_dict=True, + ) + + return decoder_output, decoder_targets + + def forward(self, samples): + r""" + Args: + samples (dict): A dictionary containing the following keys: + - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W) + - text_input (list): A list of strings of length batch_size. 
+ Returns: + output (BlipOutput): A BlipOutput object containing the following + attributes: + - loss (torch.Tensor): A scalar tensor containing the total loss. For BlipCaption, this is the same as the LM loss. + - loss_lm (torch.Tensor): A scalar tensor containing the LM loss. + - intermediate_outputs (BlipIntermediateOutput): A BlipIntermediateOutput object containing intermediate outputs. + see :class:`lavis.models.blip_models.blip_outputs.BlipOutput` for more details. + + Example: + ```python + >>> from PIL import Image + >>> from lavis.models import load_model_and_preprocess + >>> model, vis_processors, txt_processors = load_model_and_preprocess("blip_caption") + >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB") + >>> image = vis_processors["eval"](raw_image).unsqueeze(0) + >>> text_input = ["a large statue of a person spraying water from a fountain"] + >>> samples = {"image": image, "text_input": text_input} + >>> output = model(samples) + >>> output.keys() + odict_keys(['intermediate_output', 'loss', 'loss_lm']) + >>> output.intermediate_output.image_embeds.shape + torch.Size([1, 577, 768]) + >>> output.intermediate_output.decoder_labels.shape + torch.Size([1, 13]) + ```""" + + image_embeds = self.forward_encoder(samples) + decoder_output, decoder_targets = self.forward_decoder(samples, image_embeds) + + # return decoder_out + return BlipOutput( + loss=decoder_output.loss, + loss_lm=decoder_output.loss, + intermediate_output=BlipIntermediateOutput( + image_embeds=image_embeds, + decoder_output=decoder_output, + decoder_labels=decoder_targets, + ), + ) + + def generate( + self, + samples, + use_nucleus_sampling=False, + num_beams=3, + max_length=30, + min_length=10, + top_p=0.9, + repetition_penalty=1.0, + num_captions=1, + ): + """ + Args: + samples (dict): A dictionary containing the following keys: + - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W) + use_nucleus_sampling (bool): Whether to use nucleus sampling. 
If False, use top-k sampling. + num_beams (int): Number of beams for beam search. 1 means no beam search. + max_length (int): The maximum length of the sequence to be generated. + min_length (int): The minimum length of the sequence to be generated. + top_p (float): The cumulative probability for nucleus sampling. + repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. + num_captions (int): Number of captions to be generated for each image. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Example: + ```python + >>> from PIL import Image + >>> from lavis.models import load_model_and_preprocess + >>> model, vis_processors, txt_processors = load_model_and_preprocess("blip_caption") + >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB") + >>> image = vis_processors["eval"](raw_image).unsqueeze(0) + >>> samples = {"image": image} + >>> captions = model.generate(samples) + >>> captions + ['a large statue of a person spraying water from a fountain'] + >>> captions = model.generate(samples, use_nucleus_sampling=True, num_captions=3) + >>> captions # example output, results may vary due to randomness + ['singapore showing the view of some building', + 'the singapore harbor in twilight, as the weather is going down', + 'the famous singapore fountain at sunset'] + """ + # prepare inputs for decoder generation. 
+ encoder_out = self.forward_encoder(samples) + image_embeds = torch.repeat_interleave(encoder_out, num_captions, 0) + + prompt = [self.prompt] * image_embeds.size(0) + prompt = self.tokenizer(prompt, return_tensors="pt").to(self.device) + prompt.input_ids[:, 0] = self.tokenizer.bos_token_id + prompt.input_ids = prompt.input_ids[:, :-1] + + # get decoded text + decoder_out = self.text_decoder.generate_from_encoder( + tokenized_prompt=prompt, + visual_embeds=image_embeds, + sep_token_id=self.tokenizer.sep_token_id, + pad_token_id=self.tokenizer.pad_token_id, + use_nucleus_sampling=use_nucleus_sampling, + num_beams=num_beams, + max_length=max_length, + min_length=min_length, + top_p=top_p, + repetition_penalty=repetition_penalty, + ) + + outputs = self.tokenizer.batch_decode(decoder_out, skip_special_tokens=True) + captions = [output[len(self.prompt) :] for output in outputs] + + return captions + + @classmethod + def from_config(cls, cfg): + # vision encoder + image_encoder = VisionTransformerEncoder.from_config(cfg) + # text encoder + multimodal decoder + text_decoder = XBertLMHeadDecoder.from_config(cfg) + + prompt = cfg.get("prompt", None) + max_txt_len = cfg.get("max_txt_len", 40) + + model = cls(image_encoder, text_decoder, prompt=prompt, max_txt_len=max_txt_len) + model.load_checkpoint_from_config(cfg) + + return model diff --git a/lavis/models/blip_models/blip_classification.py b/lavis/models/blip_models/blip_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..46c49099d6170fd74b8cbdfae8b1925707e493b6 --- /dev/null +++ b/lavis/models/blip_models/blip_classification.py @@ -0,0 +1,177 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
@registry.register_model("blip_classification")
class BlipClassification(BlipBase, MomentumDistilationMixin):
    """BLIP image-text classification model with optional momentum distillation.

    A visual encoder and a text encoder feed a two-layer classification head.
    When ``use_distill`` is enabled, momentum copies of all three modules
    provide soft targets during training.
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "base": "configs/models/blip_classification_base.yaml",
    }

    def __init__(
        self,
        image_encoder,
        text_encoder,
        num_classes,
        momentum=0.995,
        alpha=0.4,
        max_txt_len=40,
        use_distill=True,
    ):
        """
        Args:
            image_encoder: Visual encoder module.
            text_encoder: Text encoder; its ``config.hidden_size`` sizes the head.
            num_classes (int): Number of output classes.
            momentum (float): Momentum coefficient stored for the momentum
                update (only set when ``use_distill`` is True).
            alpha (float): Maximum weight of the distillation term.
            max_txt_len (int): Maximum tokenized text length.
            use_distill (bool): Whether to build and train with momentum copies.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        self.use_distill = use_distill

        self.visual_encoder = image_encoder
        self.text_encoder = text_encoder

        hidden_size = text_encoder.config.hidden_size
        self.cls_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

        if self.use_distill:
            self.visual_encoder_m = deepcopy(self.visual_encoder)
            self.text_encoder_m = deepcopy(self.text_encoder)
            self.cls_head_m = deepcopy(self.cls_head)

            self.momentum = momentum
            self.alpha = alpha

            self.model_pairs = [
                [self.visual_encoder, self.visual_encoder_m],
                [self.text_encoder, self.text_encoder_m],
                [self.cls_head, self.cls_head_m],
            ]

            self.copy_params()

        self.max_txt_len = max_txt_len

    def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
        """Return a factor that grows linearly to 1 over the first epoch."""
        return min(1, (epoch * num_iters_per_epoch + iters) / num_iters_per_epoch)

    def forward(self, samples, is_train=True):
        """Classify image-text pairs.

        Args:
            samples (dict): Must contain ``image``, ``text_input`` and
                ``label``. When training with distillation it must also contain
                ``epoch``, ``iters`` and ``num_iters_per_epoch``. The tokenized
                text is written back under ``tokenized_text``.
            is_train (bool): If True, return a ``BlipOutputWithLogits`` carrying
                the loss; otherwise return a dict with ``predictions`` and
                ``targets``.
        """
        sentences = samples["text_input"]
        sentences = self.tokenizer(
            sentences,
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(self.device)
        samples.update({"tokenized_text": sentences})

        targets = samples["label"]

        image_embeds = self.visual_encoder.forward_features(samples["image"])
        encoder_output = self.text_encoder.forward_automask(
            samples["tokenized_text"], image_embeds
        )

        prediction = self.cls_head(encoder_output.last_hidden_state[:, 0, :])

        if not is_train:
            return {"predictions": prediction, "targets": targets}

        # The momentum outputs only exist when distillation is enabled.
        # Default them to None so the non-distilled training path can still
        # build its output instead of failing on unassigned locals.
        image_embeds_m = None
        encoder_output_m = None
        prediction_m = None

        if self.use_distill:
            with torch.no_grad():
                self._momentum_update()

                # NOTE(review): the online branch calls forward_features()
                # while this calls the momentum encoder directly - confirm
                # that both produce the same kind of output.
                image_embeds_m = self.visual_encoder_m(samples["image"])
                encoder_output_m = self.text_encoder_m.forward_automask(
                    samples["tokenized_text"], image_embeds_m
                )

                prediction_m = self.cls_head_m(
                    encoder_output_m.last_hidden_state[:, 0, :]
                )

            alpha = self.alpha * self._rampup_factor(
                epoch=samples["epoch"],
                iters=samples["iters"],
                num_iters_per_epoch=samples["num_iters_per_epoch"],
            )

            # Blend the hard-label loss with a soft-target term computed from
            # the momentum model's predictions.
            loss = (1 - alpha) * F.cross_entropy(
                prediction, targets
            ) - alpha * torch.sum(
                F.log_softmax(prediction, dim=1) * F.softmax(prediction_m, dim=1),
                dim=1,
            ).mean()
        else:
            loss = F.cross_entropy(prediction, targets)

        return BlipOutputWithLogits(
            loss=loss,
            intermediate_output=BlipIntermediateOutput(
                image_embeds=image_embeds,
                image_embeds_m=image_embeds_m,
                encoder_output=encoder_output,
                encoder_output_m=encoder_output_m,
            ),
            logits=prediction,
            logits_m=prediction_m,
        )

    def predict(self, samples):
        """Run ``forward`` in evaluation mode and return its dict output."""
        output = self.forward(samples, is_train=False)
        return output

    @classmethod
    def from_config(cls, cfg=None):
        """Build the model from a config and load pretrained weights if given.

        Reads ``use_distill``, ``momentum``, ``num_classes``, ``alpha``,
        ``max_txt_len`` and ``pretrained`` from ``cfg``; ``num_classes`` must
        be greater than 1.
        """
        image_encoder = VisionTransformerEncoder.from_config(cfg)

        # text encoder + multimodal encoder
        text_encoder = XBertEncoder.from_config(cfg)
        use_distill = cfg.get("use_distill", True)
        momentum = cfg.get("momentum", 0.995)
        num_classes = cfg.get("num_classes", -1)
        alpha = cfg.get("alpha", 0.4)
        max_txt_len = cfg.get("max_txt_len", 40)

        assert num_classes > 1, "Invalid number of classes provided, found {}".format(
            num_classes
        )

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            use_distill=use_distill,
            alpha=alpha,
            num_classes=num_classes,
            momentum=momentum,
            max_txt_len=max_txt_len,
        )

        # load pre-trained weights
        pretrain_path = cfg.get("pretrained", None)
        if pretrain_path is not None:
            model.load_from_pretrained(url_or_filename=pretrain_path)

        return model
+ + Usage: + >>> from lavis.models import load_model + >>> model = load_model("blip_feature_extractor", "base") + """ + + PRETRAINED_MODEL_CONFIG_DICT = { + "base": "configs/models/blip_feature_extractor_base.yaml", + # "large": "configs/models/blip_feature_extractor_large.yaml", + } + + def __init__(self, image_encoder, text_encoder, embed_dim, max_txt_len=40): + super().__init__() + + self.tokenizer = self.init_tokenizer() + + self.visual_encoder = image_encoder + self.text_encoder = text_encoder + + # creating projection layers for ITC + text_width = text_encoder.config.hidden_size + vision_width = image_encoder.vision_width + + self.vision_proj = nn.Linear(vision_width, embed_dim) + self.text_proj = nn.Linear(text_width, embed_dim) + + self.max_txt_len = max_txt_len + + self.temp = nn.Parameter(0.07 * torch.ones([])) + + @torch.no_grad() + def extract_features(self, samples, mode="multimodal"): + """ + Extract features for multimodal or unimodal samples. + + Args: + samples (dict): A dictionary of samples, containing the following keys: + - image (torch.Tensor): A tensor of shape (B, C, H, W) containing the image. + Raw images should be preprocessed before being passed to feature extractor. + - text_input (list): A list of strings containing the text, length B. + mode (str): The mode of feature extraction. Can be either "multimodal", "text" or "image". + If "multimodal", return image features and multimodal features; + if "text", return text features; + if "image", return image features. + Default: "multimodal". + + Returns: + BlipOutputFeatures: A BlipOutputFeatures object containing the features. + See lavis/models/blip_models/blip_outputs.py for more details. 
+ + Examples: + ```python + >>> from PIL import Image + >>> from lavis.models import load_model_and_preprocess + >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB") + >>> caption = "a large fountain spewing water into the air" + >>> model, vis_processors, txt_processors = load_model_and_preprocess("blip_feature_extractor", is_eval=True) + >>> image = vis_processors["eval"](raw_image).unsqueeze(0) + >>> text_input = txt_processors["eval"](caption) + + >>> sample = {"image": image, "text_input": [text_input]} + + >>> features_multimodal = model.extract_features(sample) + >>> features_multimodal.keys() + odict_keys(['image_embeds', 'multimodal_embeds']) + >>> features_multimodal.image_embeds.shape + torch.Size([1, 197, 768]) + >>> features_multimodal.multimodal_embeds.shape + torch.Size([1, 12, 768]) + + >>> features_text = model.extract_features(sample, mode="text") + >>> features_text.keys() + odict_keys(['text_embeds', 'text_features']) + >>> features_text.text_embeds.shape + torch.Size([1, 12, 768]) + >>> features_text.text_features.shape + torch.Size([1, 12, 256]) + + >>> features_image = model.extract_features(sample, mode="image") + >>> features_image.keys() + odict_keys(['image_embeds', 'image_features']) + >>> features_image.image_embeds.shape + torch.Size([1, 197, 768]) + >>> features_image.image_features.shape + torch.Size([1, 197, 256]) + ``` + """ + image = samples.get("image") + caption = samples.get("text_input") + + # assert mode is one of "image", "text", "multimodal" + assert mode in [ + "image", + "text", + "multimodal", + ], "mode must be one of 'image', 'text', 'multimodal'" + + # initalize output + image_embeds, text_embeds, multimodal_embeds = None, None, None + image_features, text_features = None, None + + if mode == "image": + assert ( + image is not None + ), "Image is not provided for mode 'image' or 'multimodal'" + # return image features + image_embeds = self.visual_encoder.forward_features(image) + + image_features = 
self.vision_proj(image_embeds) + image_features = F.normalize(image_features, dim=-1) + + elif mode == "text": + assert ( + caption is not None + ), "text input is None for mode 'text' or 'multimodal'" + + text = self.tokenizer(caption, return_tensors="pt", padding=True).to( + self.device + ) + + # return text features + text_output = self.text_encoder( + text.input_ids, + attention_mask=text.attention_mask, + return_dict=True, + mode="text", + ) + text_embeds = text_output.last_hidden_state + + text_features = self.text_proj(text_embeds) + text_features = F.normalize(text_features, dim=-1) + + elif mode == "multimodal": + # return multimodel features + image_embeds = self.visual_encoder.forward_features(image) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( + self.device + ) + + text = self.tokenizer(caption, return_tensors="pt", padding=True).to( + self.device + ) + text.input_ids[:, 0] = self.tokenizer.enc_token_id + + output = self.text_encoder( + text.input_ids, + attention_mask=text.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + multimodal_embeds = output.last_hidden_state + + return BlipOutputFeatures( + image_embeds=image_embeds, + image_embeds_proj=image_features, + text_embeds=text_embeds, + text_embeds_proj=text_features, + multimodal_embeds=multimodal_embeds, + ) + + @classmethod + def from_config(cls, cfg=None): + # set from_pretrained=True to load weights for 'bert-base-uncased' + image_encoder = VisionTransformerEncoder.from_config(cfg) + text_encoder = XBertEncoder.from_config(cfg) + + embed_dim = cfg.get("embed_dim", 256) + max_txt_len = cfg.get("max_txt_len", 30) + + model = cls( + image_encoder=image_encoder, + text_encoder=text_encoder, + embed_dim=embed_dim, + max_txt_len=max_txt_len, + ) + + # load pre-trained weights + pretrain_path = cfg.get("pretrained", None) + if pretrain_path is not None: + msg = 
@registry.register_model("blip_image_text_matching")
class BlipITM(BlipBase):
    """
    BLIP Image-Text Matching (ITM) model.

    Supported model types:
        - base: fine-tuned BLIP retrieval weights on COCO dataset (Karpathy split).
        - large: fine-tuned BLIP retrieval weights on COCO dataset (Karpathy split).

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("blip_image_text_matching", "base")
        >>> model = load_model("blip_image_text_matching", "large")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "base": "configs/models/blip_itm_base.yaml",
        "large": "configs/models/blip_itm_large.yaml",
    }

    def __init__(self, image_encoder, text_encoder, embed_dim=256, max_txt_len=35):
        """
        Args:
            image_encoder: Visual encoder exposing ``vision_width``.
            text_encoder: Text encoder exposing ``config.hidden_size``.
            embed_dim (int): Size of the shared projection space used for ITC.
            max_txt_len (int): Maximum tokenized text length.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        self.text_encoder = text_encoder

        self.visual_encoder = image_encoder

        self.max_txt_len = max_txt_len

        # creating projection layers for ITC
        text_width = text_encoder.config.hidden_size
        vision_width = image_encoder.vision_width

        self.vision_proj = nn.Linear(vision_width, embed_dim)
        self.text_proj = nn.Linear(text_width, embed_dim)

        self.itm_head = nn.Linear(text_width, 2)

    def forward(self, samples, match_head="itm"):
        """Score image-text pairs.

        Args:
            samples (dict): Must contain ``image`` and ``text_input``.
            match_head (str): ``"itm"`` returns the two-way matching logits
                from the image-grounded text encoder; ``"itc"`` returns the
                similarity matrix between normalized image and text
                projections.

        Raises:
            ValueError: If ``match_head`` is neither ``"itm"`` nor ``"itc"``.
        """
        image = samples["image"]
        caption = samples["text_input"]

        image_embeds = self.visual_encoder.forward_features(image)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )

        text = self.tokenizer(
            caption,
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)

        if match_head == "itm":
            # Work on a copy so the tokenizer output keeps its original ids.
            encoder_input_ids = text.input_ids.clone()
            encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
            output = self.text_encoder(
                encoder_input_ids,
                attention_mask=text.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            itm_output = self.itm_head(output.last_hidden_state[:, 0, :])
            return itm_output

        if match_head == "itc":
            text_output = self.text_encoder(
                text.input_ids,
                attention_mask=text.attention_mask,
                return_dict=True,
                mode="text",
            )
            image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
            text_feat = F.normalize(
                self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1
            )

            sim = image_feat @ text_feat.t()
            return sim

        # An unknown head used to fall through and return None silently.
        raise ValueError(
            "match_head must be 'itm' or 'itc', got {!r}".format(match_head)
        )

    def itm_rank(self, image_embeds, image_atts, encoder_input_ids, match_head='itm'):
        """Score precomputed image embeddings against pre-tokenized text.

        Args:
            image_embeds: Image embeddings passed to the text encoder.
            image_atts: Attention mask matching ``image_embeds``.
            encoder_input_ids: Token ids; a copy is made, its first three
                columns are dropped and the new first column is overwritten.
            match_head (str): ``'itm'`` returns the softmax probability of
                class index 1; ``'itc'`` returns the projection similarity
                matrix.

        Raises:
            ValueError: If ``match_head`` is neither ``'itm'`` nor ``'itc'``.
        """
        encoder_input_ids = encoder_input_ids.clone()
        # NOTE(review): drops the first three token positions - presumably a
        # fixed prefix added by the caller; confirm against the call site.
        encoder_input_ids = encoder_input_ids[:, 3:]
        text_attention_mask = (encoder_input_ids != self.tokenizer.pad_token_id).long()

        if match_head == 'itm':
            encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
            output = self.text_encoder(
                encoder_input_ids,
                attention_mask=text_attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            itm_output = self.itm_head(output.last_hidden_state[:, 0, :])
            itm_output = F.softmax(itm_output, dim=1)[:, 1]
            return itm_output

        if match_head == 'itc':
            encoder_input_ids[:, 0] = self.tokenizer.cls_token_id
            text_output = self.text_encoder(
                encoder_input_ids,
                attention_mask=text_attention_mask,
                return_dict=True,
                mode='text',
            )
            image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
            text_feat = F.normalize(
                self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1
            )

            sim = image_feat @ text_feat.t()
            return sim

        raise ValueError(
            "match_head must be 'itm' or 'itc', got {!r}".format(match_head)
        )

    @classmethod
    def from_config(cls, cfg=None):
        """Build the model from a config and load its checkpoint.

        Reads ``embed_dim`` and ``max_txt_len`` from ``cfg``.
        """
        image_encoder = VisionTransformerEncoder.from_config(cfg)
        text_encoder = XBertEncoder.from_config(cfg)

        embed_dim = cfg.get("embed_dim", 256)
        max_txt_len = cfg.get("max_txt_len", 35)

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            embed_dim=embed_dim,
            max_txt_len=max_txt_len,
        )

        model.load_checkpoint_from_config(cfg)

        return model
model({"image": visual_input, "text_input": text_input}, match_head="itm") + loss = output[:, 1].sum() + + model.zero_grad() + loss.backward() + with torch.no_grad(): + mask = tokenized_text.attention_mask.view( + tokenized_text.attention_mask.size(0), 1, -1, 1, 1 + ) # (bsz,1,token_len, 1,1) + token_length = tokenized_text.attention_mask.sum(dim=-1) - 2 + token_length = token_length.cpu() + # grads and cams [bsz, num_head, seq_len, image_patch] + grads = model.text_encoder.base_model.base_model.encoder.layer[ + block_num + ].crossattention.self.get_attn_gradients() + cams = model.text_encoder.base_model.base_model.encoder.layer[ + block_num + ].crossattention.self.get_attention_map() + + # assume using vit with 576 num image patch + cams = cams[:, :, :, 1:].reshape(visual_input.size(0), 12, -1, 24, 24) * mask + grads = ( + grads[:, :, :, 1:].clamp(0).reshape(visual_input.size(0), 12, -1, 24, 24) + * mask + ) + + gradcams = cams * grads + gradcam_list = [] + + for ind in range(visual_input.size(0)): + token_length_ = token_length[ind] + gradcam = gradcams[ind].mean(0).cpu().detach() + # [enc token gradcam, average gradcam across token, gradcam for individual token] + gradcam = torch.cat( + ( + gradcam[0:1, :], + gradcam[1 : token_length_ + 1, :].sum(dim=0, keepdim=True) + / token_length_, + gradcam[1:, :], + ) + ) + gradcam_list.append(gradcam) + + return gradcam_list, output diff --git a/lavis/models/blip_models/blip_nlvr.py b/lavis/models/blip_models/blip_nlvr.py new file mode 100644 index 0000000000000000000000000000000000000000..a67d7a1b2c27a200efaae5dda5da1c5fc9ca78e8 --- /dev/null +++ b/lavis/models/blip_models/blip_nlvr.py @@ -0,0 +1,187 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
@registry.register_model("blip_nlvr")
class BlipNLVR(BlipBase, MomentumDistilationMixin):
    """
    Class for BLIP NLVR model.

    Two images are encoded with one shared visual encoder, the text encoder
    cross-attends to both sets of image embeddings, and the hidden state at
    the first text position is classified by a two-layer MLP head.

    Supported model types:
        - nlvr: finetuned model on NLVR2 dataset.

    NOTE(review): a ``base`` type used to be listed here, but no config for it
    is registered in ``PRETRAINED_MODEL_CONFIG_DICT``.

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("blip_nlvr", "nlvr")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "nlvr": "configs/models/blip_nlvr.yaml",
    }

    def __init__(self, image_encoder, text_encoder, num_classes):
        """
        Args:
            image_encoder: visual backbone exposing ``forward_features``.
            text_encoder: text/multimodal encoder; ``config.hidden_size`` sets
                the width of the classification head.
            num_classes (int): number of output classes of the head.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()
        self.visual_encoder = image_encoder
        self.text_encoder = text_encoder

        hidden_size = text_encoder.config.hidden_size
        self.cls_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, samples, is_train=True):
        """
        Forward function for training and evaluation.

        Args:
            samples (dict): a dict of input samples, which contains the following keys:
                - image0 (torch.Tensor): input image 0, shape (batch_size, 3, H, W), default H=384, W=384.
                - image1 (torch.Tensor): input image 1, shape (batch_size, 3, H, W), default H=384, W=384.
                - text_input (list): list of strings, each string is a natural language sentence.
                - label (torch.LongTensor): ground truth label with shape (batch_size,).
                  Read in both modes (it is echoed back as ``targets`` in eval mode).
            is_train (bool): whether the model is in training mode.
                If True, the model will return the loss;
                If False, the model will return the prediction.

        Returns:
            ``BlipOutput`` (``loss`` and ``intermediate_output``) when
            ``is_train`` is True, otherwise a dict with keys ``"predictions"``
            (raw head logits) and ``"targets"``.

        Examples:
            >>> import torch
            >>> from lavis.models import load_model
            >>> model = load_model("blip_nlvr", "nlvr")
            >>> samples = {
            ...     "image0": torch.randn(2, 3, 384, 384),
            ...     "image1": torch.randn(2, 3, 384, 384),
            ...     "text_input": ["there is a ferret in tall grass", "there are lips in one of the images"],
            ...     "label": torch.tensor([0, 1]),
            ... }
            >>> output = model(samples)
            >>> output.keys()
            odict_keys(['intermediate_output', 'loss'])
        """
        text = self.tokenizer(
            samples["text_input"], padding="longest", return_tensors="pt"
        ).to(self.device)
        # The first token is replaced by the encoder token id before encoding.
        text.input_ids[:, 0] = self.tokenizer.enc_token_id

        targets = samples["label"]

        image0 = samples["image0"]
        image1 = samples["image1"]
        # Encode both images in one pass, then split them apart again.
        images = torch.cat([image0, image1], dim=0)

        image_embeds = self.visual_encoder.forward_features(images)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            self.device
        )
        # Split at the image batch boundary (the concatenation point) rather
        # than at the label count, so the halves always line up with the inputs.
        image0_embeds, image1_embeds = torch.split(image_embeds, image0.size(0))

        encoder_output = self.text_encoder(
            text.input_ids,
            attention_mask=text.attention_mask,
            encoder_hidden_states=[image0_embeds, image1_embeds],
            encoder_attention_mask=[
                image_atts[: image0_embeds.size(0)],
                image_atts[image0_embeds.size(0) :],
            ],
            return_dict=True,
        )

        prediction = self.cls_head(encoder_output.last_hidden_state[:, 0, :])

        if not is_train:
            return {"predictions": prediction, "targets": targets}

        loss = F.cross_entropy(prediction, targets)
        return BlipOutput(
            loss=loss,
            intermediate_output=BlipIntermediateOutput(
                image_embeds=torch.stack([image0_embeds, image1_embeds], dim=0),
                encoder_output=encoder_output,
            ),
        )

    def predict(self, samples):
        """Run ``forward`` in evaluation mode and return its predictions dict."""
        output = self.forward(samples, is_train=False)
        return output

    @classmethod
    def from_config(cls, cfg=None):
        """
        Build the model from a config and load its checkpoint.

        Reads ``med_config_path`` (required) and ``num_classes`` (default 3)
        from ``cfg``; the remaining keys are consumed by
        ``VisionTransformerEncoder.from_config`` and
        ``load_checkpoint_from_config``.
        """
        image_encoder = VisionTransformerEncoder.from_config(cfg)

        # text encoder + multimodal encoder
        bert_config = BertConfig.from_json_file(get_abs_path(cfg["med_config_path"]))
        text_encoder = BertModel(config=bert_config, add_pooling_layer=False)

        num_classes = cfg.get("num_classes", 3)

        assert num_classes > 1, "Invalid number of classes provided, found {}".format(
            num_classes
        )

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            num_classes=num_classes,
        )

        model.load_checkpoint_from_config(cfg)

        return model

    def load_from_pretrained(self, url_or_filename):
        """
        Load weights from a checkpoint URL or local file.

        Cross-attention weights stored under ``crossattention.self.`` and
        ``crossattention.output.dense.`` are duplicated under ``self0``/``self1``
        and ``dense0``/``dense1`` keys, so a single set of pre-trained weights
        initializes both per-image branches.

        Args:
            url_or_filename (str): URL or path of the checkpoint.

        Returns:
            The result of ``load_state_dict(..., strict=False)``.

        Raises:
            RuntimeError: if the argument is neither a URL nor an existing file.
        """
        if is_url(url_or_filename):
            checkpoint_path = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
        elif os.path.isfile(url_or_filename):
            checkpoint_path = url_or_filename
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        state_dict = checkpoint["model"]

        state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )

        # Rename only the matched cross-attention component; a plain
        # ``replace("self", ...)`` would also rewrite any other "self"
        # substring that happens to occur in the key.
        duplicated_components = (
            ("crossattention.self.", "crossattention.self{}."),
            ("crossattention.output.dense.", "crossattention.output.dense{}."),
        )
        for key in list(state_dict.keys()):
            for marker, template in duplicated_components:
                if marker in key:
                    for branch in (0, 1):
                        new_key = key.replace(marker, template.format(branch))
                        state_dict[new_key] = state_dict[key]
                    break

        msg = self.load_state_dict(state_dict, strict=False)
        print("load checkpoint from %s" % url_or_filename)
        print(f"missing keys {msg.missing_keys}")
        return msg
@dataclass
class BlipSimilarity(ModelOutput):
    """
    Data class for the image-text similarity scores produced by BLIP models.

    Field order is part of the interface of ``ModelOutput`` subclasses; do not
    reorder.
    """

    # image-to-text and text-to-image similarity scores
    sim_i2t: Optional[torch.FloatTensor] = None
    sim_t2i: Optional[torch.FloatTensor] = None

    # the same scores computed from the momentum ("_m") features
    sim_i2t_m: Optional[torch.FloatTensor] = None
    sim_t2i_m: Optional[torch.FloatTensor] = None

    # target distributions the similarity scores are trained against
    sim_i2t_targets: Optional[torch.FloatTensor] = None
    sim_t2i_targets: Optional[torch.FloatTensor] = None
@dataclass
class BlipIntermediateOutput(ModelOutput):
    """
    Data class for intermediate outputs of BLIP models.

    image_embeds (torch.FloatTensor): Image embeddings, shape (batch_size, num_patches, embed_dim).
    text_embeds (torch.FloatTensor): Text embeddings, shape (batch_size, seq_len, embed_dim).

    image_embeds_m (torch.FloatTensor): Image embeddings from momentum visual encoder, shape (batch_size, num_patches, embed_dim).
    text_embeds_m (torch.FloatTensor): Text embeddings from momentum text encoder, shape (batch_size, seq_len, embed_dim).

    encoder_output (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder.
    encoder_output_neg (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder for negative pairs.

    decoder_output (CausalLMOutputWithCrossAttentions): output from the image-grounded text decoder.
    decoder_labels (torch.LongTensor): labels for the captioning loss.

    itm_logits (torch.FloatTensor): logits for the image-text matching loss, shape (batch_size * 3, 2).
    itm_labels (torch.LongTensor): labels for the image-text matching loss, shape (batch_size * 3,)

    All fields default to None; each model fills in only the ones it computes.
    """

    # uni-modal features
    image_embeds: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None

    image_embeds_m: Optional[torch.FloatTensor] = None
    text_embeds_m: Optional[torch.FloatTensor] = None

    # intermediate outputs of multimodal encoder
    encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
    encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None

    itm_logits: Optional[torch.FloatTensor] = None
    itm_labels: Optional[torch.LongTensor] = None

    # intermediate outputs of multimodal decoder
    decoder_output: Optional[CausalLMOutputWithCrossAttentions] = None
    decoder_labels: Optional[torch.LongTensor] = None


@dataclass
class BlipOutput(ModelOutput):
    """
    Top-level output of BLIP models: total loss, per-task losses, similarity
    scores and intermediate outputs. Fields a model does not compute stay None.
    """

    # some finetuned models (e.g. BlipVQA) do not compute similarity, thus optional.
    sims: Optional[BlipSimilarity] = None

    intermediate_output: Optional[BlipIntermediateOutput] = None

    # total loss
    loss: Optional[torch.FloatTensor] = None

    # image-text contrastive loss
    loss_itc: Optional[torch.FloatTensor] = None

    # image-text matching loss
    loss_itm: Optional[torch.FloatTensor] = None

    # language-modeling (captioning) loss
    loss_lm: Optional[torch.FloatTensor] = None


@dataclass
class BlipOutputWithLogits(BlipOutput):
    """
    ``BlipOutput`` extended with logits.

    NOTE(review): ``logits_m`` presumably holds logits from a momentum model,
    following the ``_m`` naming used elsewhere in this file; confirm against
    the producing model.
    """

    logits: Optional[torch.FloatTensor] = None
    logits_m: Optional[torch.FloatTensor] = None
@dataclass
class BlipOutputFeatures(ModelOutput):
    """
    Data class of features from BlipFeatureExtractor.

    Args:
        image_embeds: (torch.FloatTensor) of shape (batch_size, num_patches+1, embed_dim), optional
        image_embeds_proj: (torch.FloatTensor) of shape (batch_size, num_patches+1, feature_dim), optional
        text_embeds: (torch.FloatTensor) of shape (batch_size, sequence_length+1, embed_dim), optional
        text_embeds_proj: (torch.FloatTensor) of shape (batch_size, sequence_length+1, feature_dim), optional
        multimodal_embeds: (torch.FloatTensor), optional.
            NOTE(review): presumably the image-grounded text embeddings; shape
            is not established in this file, confirm against BlipFeatureExtractor.

    The first embedding or feature is for the [CLS] token.

    The ``*_proj`` features are obtained by projecting the corresponding embedding into a normalized low-dimensional space.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    image_embeds_proj: Optional[torch.FloatTensor] = None

    text_embeds: Optional[torch.FloatTensor] = None
    text_embeds_proj: Optional[torch.FloatTensor] = None

    multimodal_embeds: Optional[torch.FloatTensor] = None
@registry.register_model("blip_pretrain")
class BlipPretrain(BlipBase, SharedQueueMixin, MomentumDistilationMixin):
    """
    BLIP pretrain model.

    Trains three objectives jointly: image-text contrastive learning (ITC)
    against a feature queue with momentum-distilled soft targets, image-text
    matching (ITM) with sampled in-batch negatives, and language modeling (LM)
    with the image-grounded text decoder.

    Supported model types:
        - base: BLIP base model before pretraining.
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "base": "configs/models/blip_pretrain_base.yaml",
        # "large": "configs/models/blip_pretrain_large.yaml",
    }

    def __init__(
        self,
        image_encoder,
        text_encoder,
        text_decoder,
        queue_size,
        alpha=0.4,
        embed_dim=256,
        momentum=0.995,
        tie_enc_dec_weights=True,
        max_txt_len=30,
    ):
        """
        Args:
            image_encoder: visual backbone; must expose ``vision_width`` and
                ``forward_features``.
            text_encoder: text/multimodal encoder.
            text_decoder: image-grounded text decoder (its ``bert`` sub-module
                is tied to the encoder when ``tie_enc_dec_weights`` is True).
            queue_size (int): number of feature columns kept in each queue.
            alpha (float): maximum weight of the momentum soft targets.
            embed_dim (int): dimension of the ITC projection space.
            momentum (float): stored as ``self.momentum`` for the momentum mixin.
            tie_enc_dec_weights (bool): whether to tie encoder/decoder weights.
            max_txt_len (int): tokenizer padding/truncation length.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        text_encoder.resize_token_embeddings(len(self.tokenizer))
        text_decoder.resize_token_embeddings(len(self.tokenizer))

        if tie_enc_dec_weights:
            tie_encoder_decoder_weights(
                encoder=text_encoder,
                decoder=text_decoder.bert,
                base_model_prefix="",
                skip_key="/attention",
            )

        self.visual_encoder = image_encoder

        self.text_encoder = text_encoder
        self.text_decoder = text_decoder

        # creating projection layers for ITC
        text_width = text_encoder.config.hidden_size
        vision_width = image_encoder.vision_width

        self.vision_proj = nn.Linear(vision_width, embed_dim)
        self.text_proj = nn.Linear(text_width, embed_dim)

        self.itm_head = nn.Linear(text_width, 2)

        # create the momentum encoder
        self.visual_encoder_m = deepcopy(self.visual_encoder)
        self.text_encoder_m = deepcopy(self.text_encoder)

        self.vision_proj_m = deepcopy(self.vision_proj)
        self.text_proj_m = deepcopy(self.text_proj)

        # (online, momentum) pairs consumed by the momentum mixin
        self.model_pairs = [
            [self.visual_encoder, self.visual_encoder_m],
            [self.text_encoder, self.text_encoder_m],
            [self.vision_proj, self.vision_proj_m],
            [self.text_proj, self.text_proj_m],
        ]
        self.copy_params()

        # create the queue
        self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
        self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        # queues start as random unit-norm columns
        self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
        self.text_queue = nn.functional.normalize(self.text_queue, dim=0)

        self.queue_size = queue_size
        self.momentum = momentum
        self.temp = nn.Parameter(0.07 * torch.ones([]))

        self.alpha = alpha
        self.max_txt_len = max_txt_len

    def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
        """Return a factor that grows linearly from 0 to 1 over the first two epochs."""
        return min(1, (epoch * num_iters_per_epoch + iters) / (2 * num_iters_per_epoch))

    def forward(self, samples):
        """
        Compute the ITC, ITM and LM losses for one batch.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). The input images. Default: H=224, W=224.
                - text_input (list): A list of length batch_size, each element is a string of text/caption.
                - epoch (int): The current epoch.
                - iters (int): The current iteration.
                - num_iters_per_epoch (int): The number of iterations per epoch.

        Returns:
            BlipOutput: A BlipOutput object containing loss and intermediate output. See ``lavis.models.blip_models.blip_outputs.BlipOutput`` for more details.

        Side effects:
            Clamps ``self.temp`` in place, updates the momentum models and
            pushes the momentum features of this batch into the queues.

        Examples:
            >>> import torch
            >>> from lavis.models import load_model
            >>> model = load_model("blip_pretrain", "base")
            >>> images = torch.randn(4, 3, 224, 224)
            >>> text_input = ["caption of image 1", "another caption of image 1", "caption of image 2", "caption of image 3"]
            >>> samples = {"image": images, "text_input": text_input, "epoch": 0, "iters": 0, "num_iters_per_epoch": 100}
            >>> output = model(samples)
            >>> output.keys()
            odict_keys(['sims', 'intermediate_output', 'loss', 'loss_itc', 'loss_itm', 'loss_lm'])
            >>> output.intermediate_output.keys()
            odict_keys(['image_embeds', 'text_embeds', 'image_embeds_m', 'text_embeds_m', 'encoder_output', 'encoder_output_neg', 'itm_logits', 'itm_labels', 'decoder_output', 'decoder_labels'])

        Shapes of ``output.intermediate_output`` for the example above:
            - image_embeds, image_embeds_m: (batch_size, num_patches, embed_dim), e.g. [4, 197, 768]
            - text_embeds, text_embeds_m: (batch_size, max_txt_len, embed_dim), e.g. [4, 30, 768]
            - itm_logits: (batch_size * 3, 2), e.g. [12, 2]
            - itm_labels: (batch_size * 3,), e.g. [12]
            - encoder_output.last_hidden_state: (batch_size, max_txt_len, embed_dim), e.g. [4, 30, 768]
            - encoder_output_neg.last_hidden_state: (batch_size * 2, max_txt_len, embed_dim), e.g. [8, 30, 768]
            - decoder_output.logits: (batch_size, max_txt_len, vocab_size), e.g. [4, 30, 30524]
            - decoder_labels: (batch_size, max_txt_len), e.g. [4, 30]
        """

        image = samples["image"]
        caption = samples["text_input"]

        # weight of the momentum soft targets, ramped up over the first epochs
        alpha = self.alpha * self._rampup_factor(
            epoch=samples["epoch"],
            iters=samples["iters"],
            num_iters_per_epoch=samples["num_iters_per_epoch"],
        )

        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        # image embeddings and features
        image_embeds = self.visual_encoder.forward_features(image)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )
        image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)

        text = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)

        # text embeddings and features
        text_output = self.text_encoder.forward_text(text)
        text_embeds = text_output.last_hidden_state
        text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1)

        # get momentum features
        with torch.no_grad():
            self._momentum_update()
            image_embeds_m = self.visual_encoder_m(image)
            image_feat_m = F.normalize(
                self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1
            )
            # current batch first, followed by the queued features
            image_feat_all = torch.cat(
                [image_feat_m.t(), self.image_queue.clone().detach()], dim=1
            )

            text_output_m = self.text_encoder_m.forward_text(text)
            text_embeds_m = text_output_m.last_hidden_state
            text_feat_m = F.normalize(self.text_proj_m(text_embeds_m[:, 0, :]), dim=-1)
            text_feat_all = torch.cat(
                [text_feat_m.t(), self.text_queue.clone().detach()], dim=1
            )

            sim_i2t_m = image_feat_m @ text_feat_all / self.temp
            sim_t2i_m = text_feat_m @ image_feat_all / self.temp

            # hard targets: each sample matches its own (diagonal) pair
            sim_targets = torch.zeros(sim_i2t_m.size()).to(image.device)
            sim_targets.fill_diagonal_(1)

            # blend momentum predictions with the hard targets
            sim_i2t_targets = (
                alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
            )
            sim_t2i_targets = (
                alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
            )

        sim_i2t = image_feat @ text_feat_all / self.temp
        sim_t2i = text_feat @ image_feat_all / self.temp

        loss_i2t = -torch.sum(
            F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets, dim=1
        ).mean()
        loss_t2i = -torch.sum(
            F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets, dim=1
        ).mean()

        loss_itc = (loss_i2t + loss_t2i) / 2

        self._dequeue_and_enqueue(image_feat_m, text_feat_m)

        # Image-text Matching
        encoder_input_ids = text.input_ids.clone()
        encoder_input_ids[:, 0] = self.tokenizer.enc_token_id

        # forward the positive image-text pair
        bs = image.size(0)
        output_pos = self.text_encoder(
            encoder_input_ids,
            attention_mask=text.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        # Sampling weights for negatives: in-batch similarity (first bs
        # columns), with the matching pair on the diagonal excluded.
        # NOTE(review): with bs == 1 every weight is zero after the diagonal
        # fill, which torch.multinomial presumably rejects; confirm callers
        # always use bs > 1.
        with torch.no_grad():
            weights_t2i = F.softmax(sim_t2i[:, :bs], dim=1) + 1e-4
            weights_t2i.fill_diagonal_(0)
            weights_i2t = F.softmax(sim_i2t[:, :bs], dim=1) + 1e-4
            weights_i2t.fill_diagonal_(0)

        # select a negative image for each text
        image_embeds_neg = []
        for b in range(bs):
            neg_idx = torch.multinomial(weights_t2i[b], 1).item()
            image_embeds_neg.append(image_embeds[neg_idx])
        image_embeds_neg = torch.stack(image_embeds_neg, dim=0)

        # select a negative text for each image
        text_ids_neg = []
        text_atts_neg = []
        for b in range(bs):
            neg_idx = torch.multinomial(weights_i2t[b], 1).item()
            text_ids_neg.append(encoder_input_ids[neg_idx])
            text_atts_neg.append(text.attention_mask[neg_idx])

        text_ids_neg = torch.stack(text_ids_neg, dim=0)
        text_atts_neg = torch.stack(text_atts_neg, dim=0)

        # negatives batch: (text, negative image) followed by (negative text, image)
        text_ids_all = torch.cat([encoder_input_ids, text_ids_neg], dim=0)
        text_atts_all = torch.cat([text.attention_mask, text_atts_neg], dim=0)

        image_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0)
        image_atts_all = torch.cat([image_atts, image_atts], dim=0)

        output_neg = self.text_encoder(
            text_ids_all,
            attention_mask=text_atts_all,
            encoder_hidden_states=image_embeds_all,
            encoder_attention_mask=image_atts_all,
            return_dict=True,
        )

        vl_embeddings = torch.cat(
            [
                output_pos.last_hidden_state[:, 0, :],
                output_neg.last_hidden_state[:, 0, :],
            ],
            dim=0,
        )
        itm_logits = self.itm_head(vl_embeddings)

        # bs positives (label 1) followed by 2 * bs negatives (label 0)
        itm_labels = torch.cat(
            [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)],
            dim=0,
        ).to(image.device)
        loss_itm = F.cross_entropy(itm_logits, itm_labels)

        # LM
        decoder_input_ids = text.input_ids.clone()
        decoder_input_ids[:, 0] = self.tokenizer.bos_token_id
        # padding positions are masked out of the labels with -100
        decoder_targets = decoder_input_ids.masked_fill(
            decoder_input_ids == self.tokenizer.pad_token_id, -100
        )

        decoder_output = self.text_decoder(
            decoder_input_ids,
            attention_mask=text.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            labels=decoder_targets,
            return_dict=True,
        )

        loss_lm = decoder_output.loss

        return BlipOutput(
            loss=loss_itc + loss_itm + loss_lm,
            loss_itc=loss_itc,
            loss_itm=loss_itm,
            loss_lm=loss_lm,
            sims=BlipSimilarity(
                sim_i2t=sim_i2t,
                sim_t2i=sim_t2i,
                sim_i2t_m=sim_i2t_m,
                sim_t2i_m=sim_t2i_m,
                sim_i2t_targets=sim_i2t_targets,
                sim_t2i_targets=sim_t2i_targets,
            ),
            intermediate_output=BlipIntermediateOutput(
                image_embeds=image_embeds,
                text_embeds=text_embeds,
                image_embeds_m=image_embeds_m,
                text_embeds_m=text_embeds_m,
                encoder_output=output_pos,
                encoder_output_neg=output_neg,
                itm_logits=itm_logits,
                itm_labels=itm_labels,
                decoder_output=decoder_output,
                decoder_labels=decoder_targets,
            ),
        )

    def reset_queue_ptr(self):
        """
        Reset the queue write pointer to 0.

        The new tensor is created on the device of the existing buffer, so
        calling this after the model has been moved (e.g. ``model.cuda()``)
        no longer drops ``queue_ptr`` back onto the CPU.
        """
        self.queue_ptr = torch.zeros(
            1, dtype=torch.long, device=self.queue_ptr.device
        )

    @classmethod
    def from_config(cls, cfg=None):
        """
        Build the model from a config.

        Reads ``embed_dim`` (256), ``momentum`` (0.995), ``alpha`` (0.4),
        ``max_txt_len`` (30) and ``queue_size`` (57600) from ``cfg``; encoder
        and decoder weights are always tied.
        """
        # set from_pretrained=True to load weights for 'bert-base-uncased'
        image_encoder = VisionTransformerEncoder.from_config(cfg, from_pretrained=True)
        text_encoder = XBertEncoder.from_config(cfg, from_pretrained=True)
        text_decoder = XBertLMHeadDecoder.from_config(cfg, from_pretrained=True)

        embed_dim = cfg.get("embed_dim", 256)
        momentum = cfg.get("momentum", 0.995)
        alpha = cfg.get("alpha", 0.4)
        max_txt_len = cfg.get("max_txt_len", 30)
        queue_size = cfg.get("queue_size", 57600)

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            text_decoder=text_decoder,
            embed_dim=embed_dim,
            queue_size=queue_size,
            momentum=momentum,
            alpha=alpha,
            tie_enc_dec_weights=True,
            max_txt_len=max_txt_len,
        )

        # [IMPORTANT] to reset queue pointer to 0.
        # Otherwise when updating last batch in the queue, the batch size and remaining queue length may be un-equal.
        model.reset_queue_ptr()

        return model
@registry.register_model("blip_retrieval")
class BlipRetrieval(BlipBase, MomentumDistilationMixin, SharedQueueMixin):
    """
    BLIP retrieval model.

    Trained with image-text contrastive learning (ITC) against feature queues,
    where samples sharing an ``image_id`` count as positives, plus image-text
    matching (ITM) with negatives sampled from the local batch or, when
    ``negative_all_rank`` is set, from all ranks.

    Supported model types:
        - coco: fine-tuned BLIP base model on COCO dataset (Karpathy split).
        - flickr: fine-tuned BLIP base model on Flickr30k dataset.

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("blip_retrieval", "coco")
        >>> model = load_model("blip_retrieval", "flickr")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "coco": "configs/models/blip_retrieval_coco.yaml",
        "flickr": "configs/models/blip_retrieval_flickr.yaml",
    }

    def __init__(
        self,
        image_encoder,
        text_encoder,
        queue_size,
        alpha=0.4,
        embed_dim=256,
        momentum=0.995,
        negative_all_rank=False,
        max_txt_len=35,
    ):
        """
        Args:
            image_encoder: visual backbone; must expose ``vision_width`` and
                ``forward_features``.
            text_encoder: text/multimodal encoder.
            queue_size (int): number of feature columns kept in each queue.
            alpha (float): maximum weight of the momentum soft targets.
            embed_dim (int): dimension of the ITC projection space.
            momentum (float): stored as ``self.momentum`` for the momentum mixin.
            negative_all_rank (bool): sample ITM negatives from all ranks
                instead of only the local batch.
            max_txt_len (int): tokenizer padding/truncation length.
        """
        super().__init__()

        self.tokenizer = self.init_tokenizer()

        self.visual_encoder = image_encoder

        self.text_encoder = text_encoder

        # creating projection layers for ITC
        text_width = text_encoder.config.hidden_size
        vision_width = image_encoder.vision_width

        self.vision_proj = nn.Linear(vision_width, embed_dim)
        self.text_proj = nn.Linear(text_width, embed_dim)

        self.itm_head = nn.Linear(text_width, 2)

        # create the momentum encoder
        self.visual_encoder_m = deepcopy(self.visual_encoder)
        self.text_encoder_m = deepcopy(self.text_encoder)

        self.vision_proj_m = deepcopy(self.vision_proj)
        self.text_proj_m = deepcopy(self.text_proj)

        # (online, momentum) pairs consumed by the momentum mixin
        self.model_pairs = [
            [self.visual_encoder, self.visual_encoder_m],
            [self.text_encoder, self.text_encoder_m],
            [self.vision_proj, self.vision_proj_m],
            [self.text_proj, self.text_proj_m],
        ]
        self.copy_params()

        # create the queue
        self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
        self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
        # image ids of the queued features; -100 marks slots not yet filled
        self.register_buffer("idx_queue", torch.full((1, queue_size), -100))
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        # queues start as random unit-norm columns
        self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
        self.text_queue = nn.functional.normalize(self.text_queue, dim=0)

        self.queue_size = queue_size
        self.momentum = momentum
        self.temp = nn.Parameter(0.07 * torch.ones([]))

        self.alpha = alpha
        self.max_txt_len = max_txt_len

        self.negative_all_rank = negative_all_rank

    def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
        """Return a factor that grows linearly from 0 to 1 over the first two epochs."""
        return min(1, (epoch * num_iters_per_epoch + iters) / (2 * num_iters_per_epoch))

    def forward(self, samples):
        """
        Compute the ITC and ITM losses for one batch.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). The input images.
                - text_input (list): A list of length batch_size, each element is a string of text/caption.
                - image_id (torch.Tensor): A tensor of shape (batch_size, ). The image ids, used to identify same images in batch.
                - epoch (int): The current epoch.
                - iters (int): The current iteration.
                - num_iters_per_epoch (int): The number of iterations per epoch.

        Returns:
            BlipOutput: A BlipOutput object. See ``lavis.models.blip_models.blip_outputs.BlipOutput`` for more details.

        Side effects:
            Clamps ``self.temp`` in place, updates the momentum models and
            pushes the momentum features and image ids of this batch into the
            queues.

        Examples:
            >>> import torch
            >>> from lavis.models import load_model
            >>> model = load_model("blip_retrieval", "coco")
            >>> images = torch.randn(4, 3, 384, 384)
            >>> text_input = ["caption of image 1", "another caption of image 1", "caption of image 2", "caption of image 3"]
            >>> image_id = torch.tensor([1, 1, 2, 3])
            >>> samples = {"image": images, "text_input": text_input, "image_id": image_id, "epoch": 0, "iters": 0, "num_iters_per_epoch": 100}
            >>> output = model(samples)
            >>> output.keys()
            odict_keys(['sims', 'intermediate_output', 'loss', 'loss_itc', 'loss_itm'])
        """
        image = samples["image"]
        caption = samples["text_input"]
        idx = samples["image_id"]

        # weight of the momentum soft targets, ramped up over the first epochs
        alpha = self.alpha * self._rampup_factor(
            epoch=samples["epoch"],
            iters=samples["iters"],
            num_iters_per_epoch=samples["num_iters_per_epoch"],
        )

        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_embeds = self.visual_encoder.forward_features(image)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )
        image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)

        text = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)

        text_output = self.text_encoder.forward_text(text)
        text_embeds = text_output.last_hidden_state
        text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1)

        # Image-text Contrastive Learning
        # Every entry (batch or queue) with the same image id is a positive;
        # each row of the target is normalized to sum to 1.
        idx = idx.view(-1, 1)
        idx_all = torch.cat([idx.t(), self.idx_queue.clone().detach()], dim=1)
        pos_idx = torch.eq(idx, idx_all).float()
        sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)

        # get momentum features
        with torch.no_grad():
            self._momentum_update()
            image_embeds_m = self.visual_encoder_m(image)
            image_feat_m = F.normalize(
                self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1
            )
            # current batch first, followed by the queued features
            image_feat_m_all = torch.cat(
                [image_feat_m.t(), self.image_queue.clone().detach()], dim=1
            )

            text_output_m = self.text_encoder_m.forward_text(text)
            text_embeds_m = text_output_m.last_hidden_state
            text_feat_m = F.normalize(self.text_proj_m(text_embeds_m[:, 0, :]), dim=-1)
            text_feat_m_all = torch.cat(
                [text_feat_m.t(), self.text_queue.clone().detach()], dim=1
            )

            sim_i2t_m = image_feat_m @ text_feat_m_all / self.temp
            sim_t2i_m = text_feat_m @ image_feat_m_all / self.temp

            # blend momentum predictions with the id-based targets
            sim_i2t_targets = (
                alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
            )
            sim_t2i_targets = (
                alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
            )

        sim_i2t = image_feat @ text_feat_m_all / self.temp
        sim_t2i = text_feat @ image_feat_m_all / self.temp

        loss_i2t = -torch.sum(
            F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets, dim=1
        ).mean()
        loss_t2i = -torch.sum(
            F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets, dim=1
        ).mean()

        loss_itc = (loss_i2t + loss_t2i) / 2

        self._dequeue_and_enqueue(image_feat_m, text_feat_m, idx)

        # Image-text Matching
        encoder_input_ids = text.input_ids.clone()
        encoder_input_ids[:, 0] = self.tokenizer.enc_token_id

        # forward the positive image-text pair
        bs = image.size(0)
        output_pos = self.text_encoder(
            encoder_input_ids,
            attention_mask=text.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        # NOTE(review): ``idxs`` is only read in the all-rank branch below, yet
        # it is gathered unconditionally; left as is because the behaviour of
        # ``concat_all_gather`` is not visible here.
        idxs = concat_all_gather(idx)

        # NOTE(review): both branches rebind ``sim_i2t`` / ``sim_t2i`` to the
        # no-grad similarities used for negative sampling, so the values
        # returned in ``BlipSimilarity`` are those, not the queue-based ITC
        # similarities computed above. Kept unchanged to preserve the shapes
        # callers currently receive.
        if self.negative_all_rank:
            # compute sample similarity
            with torch.no_grad():
                # entries sharing an image id are never sampled as negatives
                mask = torch.eq(idx, idxs.t())

                image_feat_world = concat_all_gather(image_feat)
                text_feat_world = concat_all_gather(text_feat)

                sim_i2t = image_feat @ text_feat_world.t() / self.temp
                sim_t2i = text_feat @ image_feat_world.t() / self.temp

                weights_i2t = F.softmax(sim_i2t, dim=1)
                weights_i2t.masked_fill_(mask, 0)

                weights_t2i = F.softmax(sim_t2i, dim=1)
                weights_t2i.masked_fill_(mask, 0)

            image_embeds_world = all_gather_with_grad(image_embeds)

            # select a negative image (from all ranks) for each text
            image_embeds_neg = []
            for b in range(bs):
                neg_idx = torch.multinomial(weights_t2i[b], 1).item()
                image_embeds_neg.append(image_embeds_world[neg_idx])
            image_embeds_neg = torch.stack(image_embeds_neg, dim=0)

            # select a negative text (from all ranks) for each image
            input_ids_world = concat_all_gather(encoder_input_ids)
            att_mask_world = concat_all_gather(text.attention_mask)

            text_ids_neg = []
            text_atts_neg = []
            for b in range(bs):
                neg_idx = torch.multinomial(weights_i2t[b], 1).item()
                text_ids_neg.append(input_ids_world[neg_idx])
                text_atts_neg.append(att_mask_world[neg_idx])

        else:
            with torch.no_grad():
                # entries sharing an image id are never sampled as negatives
                mask = torch.eq(idx, idx.t())

                sim_i2t = image_feat @ text_feat.t() / self.temp
                sim_t2i = text_feat @ image_feat.t() / self.temp

                weights_i2t = F.softmax(sim_i2t, dim=1)
                weights_i2t.masked_fill_(mask, 0)

                weights_t2i = F.softmax(sim_t2i, dim=1)
                weights_t2i.masked_fill_(mask, 0)

            # select a negative image (from same rank) for each text
            image_embeds_neg = []
            for b in range(bs):
                neg_idx = torch.multinomial(weights_t2i[b], 1).item()
                image_embeds_neg.append(image_embeds[neg_idx])
            image_embeds_neg = torch.stack(image_embeds_neg, dim=0)

            # select a negative text (from same rank) for each image
            text_ids_neg = []
            text_atts_neg = []
            for b in range(bs):
                neg_idx = torch.multinomial(weights_i2t[b], 1).item()
                text_ids_neg.append(encoder_input_ids[neg_idx])
                text_atts_neg.append(text.attention_mask[neg_idx])

        text_ids_neg = torch.stack(text_ids_neg, dim=0)
        text_atts_neg = torch.stack(text_atts_neg, dim=0)

        # negatives batch: (text, negative image) followed by (negative text, image)
        text_ids_all = torch.cat([encoder_input_ids, text_ids_neg], dim=0)
        text_atts_all = torch.cat([text.attention_mask, text_atts_neg], dim=0)

        image_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0)
        image_atts_all = torch.cat([image_atts, image_atts], dim=0)

        output_neg = self.text_encoder(
            text_ids_all,
            attention_mask=text_atts_all,
            encoder_hidden_states=image_embeds_all,
            encoder_attention_mask=image_atts_all,
            return_dict=True,
        )

        vl_embeddings = torch.cat(
            [
                output_pos.last_hidden_state[:, 0, :],
                output_neg.last_hidden_state[:, 0, :],
            ],
            dim=0,
        )
        itm_logits = self.itm_head(vl_embeddings)

        # bs positives (label 1) followed by 2 * bs negatives (label 0)
        itm_labels = torch.cat(
            [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)],
            dim=0,
        ).to(self.device)
        loss_itm = F.cross_entropy(itm_logits, itm_labels)

        return BlipOutput(
            loss=loss_itc + loss_itm,
            loss_itc=loss_itc,
            loss_itm=loss_itm,
            sims=BlipSimilarity(
                sim_i2t=sim_i2t,
                sim_t2i=sim_t2i,
                sim_i2t_m=sim_i2t_m,
                sim_t2i_m=sim_t2i_m,
                sim_i2t_targets=sim_i2t_targets,
                sim_t2i_targets=sim_t2i_targets,
            ),
            intermediate_output=BlipIntermediateOutput(
                image_embeds=image_embeds,
                image_embeds_m=image_embeds_m,
                text_embeds=text_embeds,
                text_embeds_m=text_embeds_m,
                encoder_output=output_pos,
                encoder_output_neg=output_neg,
                itm_logits=itm_logits,
                itm_labels=itm_labels,
            ),
        )

    def reset_queue_ptr(self):
        """
        Reset the queue write pointer to 0.

        The new tensor is created on the device of the existing buffer, so
        calling this after the model has been moved (e.g. ``model.cuda()``)
        no longer drops ``queue_ptr`` back onto the CPU.
        """
        self.queue_ptr = torch.zeros(
            1, dtype=torch.long, device=self.queue_ptr.device
        )

    @classmethod
    def from_config(cls, cfg=None):
        """
        Build the model from a config and load its checkpoint.

        Reads ``embed_dim`` (256), ``momentum`` (0.995), ``alpha`` (0.4),
        ``negative_all_rank`` (False), ``queue_size`` (0) and ``max_txt_len``
        (35) from ``cfg``.
        """
        # Encoders are built without ``from_pretrained``; weights come from
        # ``load_checkpoint_from_config`` below.
        image_encoder = VisionTransformerEncoder.from_config(cfg)
        text_encoder = XBertEncoder.from_config(cfg)

        embed_dim = cfg.get("embed_dim", 256)
        momentum = cfg.get("momentum", 0.995)
        alpha = cfg.get("alpha", 0.4)
        negative_all_rank = cfg.get("negative_all_rank", False)

        queue_size = cfg.get("queue_size", 0)
        max_txt_len = cfg.get("max_txt_len", 35)

        model = cls(
            image_encoder=image_encoder,
            text_encoder=text_encoder,
            queue_size=queue_size,
            alpha=alpha,
            embed_dim=embed_dim,
            momentum=momentum,
            negative_all_rank=negative_all_rank,
            max_txt_len=max_txt_len,
        )

        model.load_checkpoint_from_config(cfg)
        # the checkpoint may carry a non-zero pointer; start from slot 0 again
        model.reset_queue_ptr()

        return model

    def compute_sim_matrix(self, data_loader, task_cfg):
        """
        Compute similarity i2t, t2i matrix for the given data loader.

        Delegates to the module-level ``compute_sim_matrix`` helper with
        ``k_test`` taken from ``task_cfg``.
        """
        k_test = task_cfg.k_test

        return compute_sim_matrix(model=self, data_loader=data_loader, k_test=k_test)
def __init__(self, image_encoder, text_encoder, text_decoder, max_txt_len=35):
    """Assemble the VQA model from its three sub-networks.

    Args:
        image_encoder: Module stored as ``visual_encoder``; the encoder pass
            calls its ``forward_features`` on the image batch.
        text_encoder: Module stored as ``text_encoder``; the encoder pass
            calls its ``forward_automask`` with the tokenized questions and
            the image features.
        text_decoder: Module stored as ``text_decoder``; used to score or
            generate answers.
        max_txt_len (int): Truncation length applied when tokenizing
            questions.
    """
    super().__init__()
    # Tokenizer is produced by the init_tokenizer() helper available on this class.
    self.tokenizer = self.init_tokenizer()

    # NOTE: sub-modules are assigned in a fixed order; keep it stable so the
    # module registration order does not change.
    self.visual_encoder = image_encoder

    self.text_encoder = text_encoder
    self.text_decoder = text_decoder

    self.max_txt_len = max_txt_len
def forward_encoder(self, samples):
    """Encode the questions jointly with the image features.

    Args:
        samples (dict): Batch with ``"text_input"`` (the questions) and
            ``"image"``. The tokenized questions are written back into it
            under ``"tokenized_text"``.

    Returns:
        tuple: ``(encoder_output, image_embeds)`` - the text encoder output
        and the visual features it was given.
    """
    tokenized = self.tokenizer(
        samples["text_input"],
        padding="longest",
        truncation=True,
        max_length=self.max_txt_len,
        return_tensors="pt",
    ).to(self.device)
    # The first token of every question is overwritten with the encoder marker.
    tokenized.input_ids[:, 0] = self.tokenizer.enc_token_id
    # Kept on the batch: forward_decoder and _rank_answers read it back.
    samples.update(tokenized_text=tokenized)

    visual_feats = self.visual_encoder.forward_features(samples["image"])
    fused = self.text_encoder.forward_automask(
        tokenized_text=tokenized, visual_embeds=visual_feats
    )

    return fused, visual_feats
def predict_answers(
    self,
    samples,
    num_beams=3,
    inference_method="rank",
    max_len=10,
    min_len=1,
    num_ans_candidates=128,
    answer_list=None,
    **kwargs
):
    """
    Answer a batch of questions, either by ranking a closed answer list or by free generation.

    Args:
        samples (dict): A dictionary containing the following keys:
            - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). Default H=480, W=480.
            - text_input (str or [str]): String or a list of strings, each string is a question.
                                         The number of questions must be equal to the batch size. If a single string, will be converted to a list of string, with length 1 first.
                                         A single string is replaced by a one-element list inside ``samples`` itself.
        num_beams (int): Number of beams for beam search. 1 means no beam search. Only used when inference_method is "generate".
        inference_method (str): Inference method. One of "rank", "generate".
            - If "rank", the model will return answers with the highest probability from the answer list.
              ``answer_list`` is required in this mode.
            - If "generate", the model will generate answers.
        max_len (int): Maximum length of generated answers. Only used when inference_method is "generate".
        min_len (int): Minimum length of generated answers. Only used when inference_method is "generate".
        num_ans_candidates (int): Number of answer candidates, used to filter out answers with low probability.
            Capped at ``len(answer_list)``. Only used when inference_method is "rank".
        answer_list (list): A list of strings, each string is an answer. Required when inference_method is "rank".
        **kwargs: Accepted and ignored.

    Returns:
        List: A list of strings, each string is an answer.

    Raises:
        AssertionError: If ``inference_method`` is not "rank" or "generate", if the number of questions
            differs from the image batch size, or if "rank" is requested without ``answer_list``.

    Examples:
    ```python
        >>> from PIL import Image
        >>> from lavis.models import load_model_and_preprocess
        >>> model, vis_processors, txt_processors = load_model_and_preprocess("blip_vqa", "vqav2")
        >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB")
        >>> question = "Which city is this photo taken?"
        >>> image = vis_processors["eval"](raw_image).unsqueeze(0)
        >>> question = txt_processors["eval"](question)
        >>> samples = {"image": image, "text_input": [question]}
        >>> # free-form generation: no answer list needed
        >>> answers = model.predict_answers(samples, inference_method="generate")
        >>> answers
        ['singapore']
        >>> # ranking (the default method) requires an answer list
        >>> answer_list = ["Singapore", "London", "Palo Alto", "Tokyo"]
        >>> answers = model.predict_answers(samples, answer_list=answer_list)
        >>> answers
        ['Singapore']
    ```
    """
    assert inference_method in [
        "rank",
        "generate",
    ], "Inference method must be one of 'rank' or 'generate', got {}.".format(
        inference_method
    )

    # Normalize a lone question string to a one-element list (stored back into samples).
    if isinstance(samples["text_input"], str):
        samples["text_input"] = [samples["text_input"]]

    assert len(samples["text_input"]) == samples["image"].size(
        0
    ), "The number of questions must be equal to the batch size."

    if inference_method == "generate":
        return self._generate_answers(
            samples, num_beams=num_beams, max_length=max_len, min_length=min_len
        )
    elif inference_method == "rank":
        assert answer_list is not None, "answer_list must be provided for ranking"

        # topk cannot select more candidates than there are answers.
        num_ans_candidates = min(num_ans_candidates, len(answer_list))

        return self._rank_answers(
            samples, answer_list=answer_list, num_ans_candidates=num_ans_candidates
        )
def _rank_answers(self, samples, answer_list, num_ans_candidates):
    """
    Pick, for every question, the best entry of ``answer_list``.

    Two decoder passes are used. The first feeds only the BOS token and
    shortlists the ``num_ans_candidates`` answers whose first token is most
    probable. The second feeds the shortlisted answers as both input and
    labels and keeps, per question, the answer with the lowest decoder loss.

    Args:
        samples (dict): Batch passed to ``forward_encoder`` (which also stores
            the tokenized questions under ``"tokenized_text"``).
        answer_list (list): Candidate answer strings.
        num_ans_candidates (int): Size of the per-question shortlist.

    Returns:
        list: One string from ``answer_list`` per question.
    """
    tok = self.tokenizer
    candidates = tok(answer_list, padding="longest", return_tensors="pt").to(
        self.device
    )
    candidates.input_ids[:, 0] = tok.bos_token_id
    cand_ids = candidates.input_ids
    cand_mask = candidates.attention_mask

    enc_out, _ = self.forward_encoder(samples)
    q_states = enc_out.last_hidden_state
    q_mask = samples["tokenized_text"].attention_mask
    n_questions = q_states.size(0)

    # Pass 1: a single BOS token per question.
    bos_column = cand_ids[0, 0].repeat(n_questions, 1)
    first_step = self.text_decoder(
        bos_column,
        encoder_hidden_states=q_states,
        encoder_attention_mask=q_mask,
        return_dict=True,
        reduction="none",
    )
    first_logits = first_step.logits[:, 0, :]

    # Probability of each candidate's first real token; shortlist is
    # [n_questions, num_ans_candidates] and indexes into answer_list.
    first_tokens = cand_ids[:, 1]
    first_probs = F.softmax(first_logits, dim=1).index_select(
        dim=1, index=first_tokens
    )
    _, shortlist = first_probs.topk(num_ans_candidates, dim=1)

    # Pass 2 inputs: [n_questions * num_ans_candidates, answer_len]
    dec_ids = torch.cat(
        [cand_ids.index_select(dim=0, index=row) for row in shortlist], dim=0
    )
    dec_mask = torch.cat(
        [cand_mask.index_select(dim=0, index=row) for row in shortlist], dim=0
    )
    # Padding positions are set to -100 in the labels.
    labels = dec_ids.masked_fill(dec_ids == tok.pad_token_id, -100)

    # Repeat each question's encoder output once per shortlisted answer.
    q_states = tile(q_states, 0, num_ans_candidates)
    q_mask = tile(q_mask, 0, num_ans_candidates)

    scored = self.text_decoder(
        dec_ids,
        attention_mask=dec_mask,
        encoder_hidden_states=q_states,
        encoder_attention_mask=q_mask,
        labels=labels,
        return_dict=True,
        reduction="none",
    )

    # One (negated) loss value per (question, candidate) pair.
    log_likelihood = (-scored.loss).view(n_questions, num_ans_candidates)

    best_slot = log_likelihood.argmax(dim=1)
    # Map the winning shortlist slot back to its index in answer_list.
    best_ids = shortlist.gather(1, best_slot.unsqueeze(1)).squeeze(1)

    return [answer_list[idx] for idx in best_ids]
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings.

    The word (or caller-supplied) embeddings are summed with absolute
    position embeddings when ``position_embedding_type`` is ``"absolute"``,
    then passed through LayerNorm and dropout.
    """

    def __init__(self, config):
        """
        Args:
            config: Object providing ``vocab_size``, ``hidden_size``,
                ``pad_token_id``, ``max_position_embeddings``,
                ``layer_norm_eps``, ``hidden_dropout_prob`` and, optionally,
                ``position_embedding_type`` (defaults to ``"absolute"``).
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
        )
        self.position_embedding_type = getattr(
            config, "position_embedding_type", "absolute"
        )

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        inputs_embeds=None,
        past_key_values_length=0,
    ):
        """Embed a batch of tokens.

        Args:
            input_ids: Token ids of shape ``(batch, seq_len)``. Ignored for
                the lookup when ``inputs_embeds`` is given.
            position_ids: Optional explicit position ids. Defaults to the
                slice of the ``position_ids`` buffer starting at
                ``past_key_values_length``.
            inputs_embeds: Optional pre-computed embeddings of shape
                ``(batch, seq_len, hidden)``. The tensor is NOT modified.
            past_key_values_length (int): Offset of the first position.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden)``.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[
                :, past_key_values_length : seq_length + past_key_values_length
            ]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            # Out-of-place add: ``embeddings`` may alias the caller's
            # ``inputs_embeds``; an in-place ``+=`` would silently overwrite
            # it and raises for a leaf tensor that requires grad.
            embeddings = embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
def transpose_for_scores(self, x):
    """Split the last dimension into attention heads.

    The trailing dimension of ``x`` is reshaped to
    ``(num_attention_heads, attention_head_size)`` and axes 1 and 2 of the
    resulting 4-D tensor are swapped.

    Args:
        x: Tensor whose last dimension equals
            ``num_attention_heads * attention_head_size``.

    Returns:
        A 4-D view with the head axis at position 1.
    """
    head_layout = (self.num_attention_heads, self.attention_head_size)
    per_head = x.view(*x.size()[:-1], *head_layout)
    return per_head.permute(0, 2, 1, 3)
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
class BertSelfOutput(nn.Module):
    """Output projection, dropout and residual LayerNorm after attention.

    In ``twin`` mode the module owns two projections (``dense0``/``dense1``)
    and expects a pair of attention outputs, which are either averaged or,
    when ``merge`` is set, concatenated and projected by ``merge_layer``.
    Otherwise a single projection (``dense``) is applied to a single tensor.
    """

    def __init__(self, config, twin=False, merge=False):
        """
        Args:
            config: Object providing ``hidden_size``, ``layer_norm_eps``,
                ``hidden_dropout_prob`` and, when ``merge`` is set,
                ``hidden_act``.
            twin (bool): Create ``dense0``/``dense1`` instead of ``dense``.
            merge (bool): Combine the two branches with ``merge_layer``
                instead of averaging them.
        """
        super().__init__()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if twin:
            self.dense0 = nn.Linear(config.hidden_size, config.hidden_size)
            self.dense1 = nn.Linear(config.hidden_size, config.hidden_size)
        else:
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if merge:
            # NOTE: ``act`` is created but not applied in forward().
            self.act = ACT2FN[config.hidden_act]
            self.merge_layer = nn.Linear(config.hidden_size * 2, config.hidden_size)
            self.merge = True
        else:
            self.merge = False

    def forward(self, hidden_states, input_tensor):
        """
        Args:
            hidden_states: A single tensor, or a two-element list/tuple of
                tensors for a ``twin`` module.
            input_tensor: Residual input added before the LayerNorm.

        Returns:
            Tensor with the same shape as ``input_tensor``.
        """
        # isinstance (rather than an exact type comparison) so that list
        # subclasses and tuples take the twin path as well.
        if isinstance(hidden_states, (list, tuple)):
            hidden_states0 = self.dense0(hidden_states[0])
            hidden_states1 = self.dense1(hidden_states[1])
            if self.merge:
                # hidden_states = self.merge_layer(self.act(torch.cat([hidden_states0,hidden_states1],dim=-1)))
                hidden_states = self.merge_layer(
                    torch.cat([hidden_states0, hidden_states1], dim=-1)
                )
            else:
                hidden_states = (hidden_states0 + hidden_states1) / 2
        else:
            hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
def prune_heads(self, heads):
    """Remove the given attention heads from this attention block.

    Works for both layouts this class can have: the plain self-attention
    layout (``self`` + ``output.dense``) and the cross-attention layout with
    twin branches (``self0``/``self1`` + ``output.dense0``/``output.dense1``).
    The same heads are removed from every branch.

    Args:
        heads: Collection of head indices to prune. An empty collection is a
            no-op.
    """
    if len(heads) == 0:
        return

    # Pair every attention branch with the name of its output projection.
    if hasattr(self, "self"):
        branches = [(self.self, "dense")]
    else:
        branches = [(self.self0, "dense0"), (self.self1, "dense1")]

    reference = branches[0][0]
    heads, index = find_pruneable_heads_and_indices(
        heads,
        reference.num_attention_heads,
        reference.attention_head_size,
        self.pruned_heads,
    )

    for attn, dense_name in branches:
        # Prune linear layers
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        setattr(
            self.output,
            dense_name,
            prune_linear_layer(getattr(self.output, dense_name), index, dim=1),
        )

        # Update hyper params
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads

    # Store pruned heads
    self.pruned_heads = self.pruned_heads.union(heads)
class BertOutput(nn.Module):
    """Second half of the feed-forward block.

    Projects from ``intermediate_size`` back to ``hidden_size``, applies
    dropout, adds the residual input and normalizes the sum.
    """

    def __init__(self, config):
        """
        Args:
            config: Object providing ``intermediate_size``, ``hidden_size``,
                ``layer_norm_eps`` and ``hidden_dropout_prob``.
        """
        super().__init__()
        width_in, width_out = config.intermediate_size, config.hidden_size
        self.dense = nn.Linear(width_in, width_out)
        self.LayerNorm = nn.LayerNorm(width_out, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` :class:`BertLayer` modules.

    Set ``gradient_checkpointing`` to ``True`` to recompute layer activations
    during the backward pass while training.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [BertLayer(config, i) for i in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        mode="multimodal",
    ):
        """Run the hidden states through every layer in order.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Passed unchanged to every layer.
            head_mask: Optional per-layer sequence; entry ``i`` goes to layer ``i``.
            encoder_hidden_states: Passed unchanged to every layer.
            encoder_attention_mask: Passed unchanged to every layer.
            past_key_values: Optional per-layer sequence; entry ``i`` goes to layer ``i``.
            use_cache: When truthy, the last element of each layer's output
                is collected. Forced to ``False`` under gradient checkpointing.
            output_attentions: When true, element 1 of each layer's output is
                collected as the self-attention map.
            output_hidden_states: When true, the input of every layer and the
                final output are collected.
            return_dict: Return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a tuple of the non-``None`` results.
            mode: Forwarded to every layer as the ``mode`` keyword.

        Returns:
            A ``BaseModelOutputWithPastAndCrossAttentions`` or, when
            ``return_dict`` is false, a tuple of the non-``None`` values among
            (hidden states, cache, all hidden states, self-attentions,
            cross-attentions).
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        # NOTE(review): this tuple is created but nothing below appends to
        # it, so it is returned empty.
        all_cross_attentions = (
            () if output_attentions and self.config.add_cross_attention else None
        )

        next_decoder_cache = () if use_cache else None

        checkpointing = self.gradient_checkpointing and self.training
        checkpoint_kwargs = {}
        if checkpointing:
            import inspect

            # Pass ``use_reentrant`` explicitly where the installed torch
            # knows the parameter (keeps the historical re-entrant behavior
            # without relying on the implicit default); older releases do not
            # accept it.
            if (
                "use_reentrant"
                in inspect.signature(torch.utils.checkpoint.checkpoint).parameters
            ):
                checkpoint_kwargs["use_reentrant"] = True

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if checkpointing:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                # The layer-specific values are bound as arguments so that the
                # recomputation during backward sees this layer's values, not
                # those of the last loop iteration. ``mode`` travels through
                # the closure because the re-entrant checkpoint rejects
                # keyword arguments meant for the wrapped function.
                def create_custom_forward(module, layer_past, layer_mode):
                    def custom_forward(*inputs):
                        return module(
                            *inputs, layer_past, output_attentions, mode=layer_mode
                        )

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module, past_key_value, mode),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    **checkpoint_kwargs,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    mode=mode,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPreTrainedModel(PreTrainedModel):
    """
    Abstract base that supplies weight initialization and the pretrained
    download/loading interface to the BERT variants in this module.
    """

    config_class = BertConfig
    base_model_prefix = "bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module.

        Linear and Embedding weights are drawn from a normal distribution
        with std ``config.initializer_range``; Linear biases are zeroed;
        LayerNorm is reset to weight 1 and bias 0. Other modules are left
        untouched.
        """
        is_linear = isinstance(module, nn.Linear)
        if is_linear or isinstance(module, nn.Embedding):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if is_linear and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
+ argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode="multimodal", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. 
+ use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds or encoder_embeds" + ) + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + 
encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode=mode, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) diff --git a/lavis/models/clip_models/__init__.py b/lavis/models/clip_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..325e25255550a00fdd082deb82a8a0da567cadb0 --- /dev/null +++ b/lavis/models/clip_models/__init__.py @@ -0,0 +1,14 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/mlfoundations/open_clip +""" + +""" OpenAI pretrained model functions +Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. + +Originally MIT License, Copyright (c) 2021 OpenAI. 
+""" diff --git a/lavis/models/clip_models/__pycache__/__init__.cpython-310.pyc b/lavis/models/clip_models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ab7319ca863169cd8e8c67fe51862c459f06e45 Binary files /dev/null and b/lavis/models/clip_models/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/clip_models/__pycache__/clip_outputs.cpython-310.pyc b/lavis/models/clip_models/__pycache__/clip_outputs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1a4182b5761587413e5ab42b44acd96015edeb8 Binary files /dev/null and b/lavis/models/clip_models/__pycache__/clip_outputs.cpython-310.pyc differ diff --git a/lavis/models/clip_models/__pycache__/model.cpython-310.pyc b/lavis/models/clip_models/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a7f0e6d64f8ad7b9369ae545c391c5a3c0d3d9a Binary files /dev/null and b/lavis/models/clip_models/__pycache__/model.cpython-310.pyc differ diff --git a/lavis/models/clip_models/__pycache__/pretrained.cpython-310.pyc b/lavis/models/clip_models/__pycache__/pretrained.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e38813ee406b5e690e6fc4efec43c3f91dd664d7 Binary files /dev/null and b/lavis/models/clip_models/__pycache__/pretrained.cpython-310.pyc differ diff --git a/lavis/models/clip_models/__pycache__/timm_model.cpython-310.pyc b/lavis/models/clip_models/__pycache__/timm_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d32c8da3891083931c91b5ec26531fe1b87c260 Binary files /dev/null and b/lavis/models/clip_models/__pycache__/timm_model.cpython-310.pyc differ diff --git a/lavis/models/clip_models/__pycache__/transform.cpython-310.pyc b/lavis/models/clip_models/__pycache__/transform.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..03221dae42d7810b0919a92861689a6209eb9533 Binary files /dev/null and b/lavis/models/clip_models/__pycache__/transform.cpython-310.pyc differ diff --git a/lavis/models/clip_models/__pycache__/utils.cpython-310.pyc b/lavis/models/clip_models/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42dd7bee87665357909690c95227545c4cb795e3 Binary files /dev/null and b/lavis/models/clip_models/__pycache__/utils.cpython-310.pyc differ diff --git a/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz b/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/lavis/models/clip_models/clip_outputs.py b/lavis/models/clip_models/clip_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..3a7bb032e01189d923c4e78b63bec94138d481f7 --- /dev/null +++ b/lavis/models/clip_models/clip_outputs.py @@ -0,0 +1,43 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/mlfoundations/open_clip +""" + +from dataclasses import dataclass + +from typing import Optional + +import torch +from transformers.modeling_outputs import ModelOutput + + +@dataclass +class ClipOutputFeatures(ModelOutput): + """ + Data class of features from AlbefFeatureExtractor. 
+ + Args: + image_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional` + image_features: `torch.FloatTensor` of shape `(batch_size, 1, feature_dim)`, `optional` + text_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional` + text_features: `torch.FloatTensor` of shape `(batch_size, 1, feature_dim)`, `optional` + """ + + image_embeds: Optional[torch.FloatTensor] = None + image_embeds_proj: Optional[torch.FloatTensor] = None + + text_embeds: Optional[torch.FloatTensor] = None + text_embeds_proj: Optional[torch.FloatTensor] = None + + +@dataclass +class ClipOutput(ModelOutput): + intermediate_output: Optional[ClipOutputFeatures] = None + + logit_scale_exp: Optional[torch.FloatTensor] = None + + loss: Optional[torch.FloatTensor] = None diff --git a/lavis/models/clip_models/loss.py b/lavis/models/clip_models/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..da92413b1a26df994eb48c714a4c03be6c409fcf --- /dev/null +++ b/lavis/models/clip_models/loss.py @@ -0,0 +1,141 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import logging +import torch +import torch.distributed.nn +from torch import distributed as dist, nn as nn +from torch.nn import functional as F + +try: + import horovod.torch as hvd +except ImportError: + hvd = None + + +def gather_features( + image_features, + text_features, + local_loss=False, + gather_with_grad=False, + rank=0, + world_size=1, + use_horovod=False, +): + if use_horovod: + assert hvd is not None, "Please install horovod" + if gather_with_grad: + all_image_features = hvd.allgather(image_features) + all_text_features = hvd.allgather(text_features) + else: + with torch.no_grad(): + all_image_features = hvd.allgather(image_features) + all_text_features = hvd.allgather(text_features) + if not local_loss: + # ensure grads for local rank when all_* features don't have a gradient + gathered_image_features = list( + all_image_features.chunk(world_size, dim=0) + ) + gathered_text_features = list( + all_text_features.chunk(world_size, dim=0) + ) + gathered_image_features[rank] = image_features + gathered_text_features[rank] = text_features + all_image_features = torch.cat(gathered_image_features, dim=0) + all_text_features = torch.cat(gathered_text_features, dim=0) + else: + # We gather tensors from all gpus + if gather_with_grad: + all_image_features = torch.cat( + torch.distributed.nn.all_gather(image_features), dim=0 + ) + all_text_features = torch.cat( + torch.distributed.nn.all_gather(text_features), dim=0 + ) + else: + gathered_image_features = [ + torch.zeros_like(image_features) for _ in range(world_size) + ] + gathered_text_features = [ + torch.zeros_like(text_features) for _ in range(world_size) + ] + dist.all_gather(gathered_image_features, image_features) + dist.all_gather(gathered_text_features, text_features) + if not local_loss: + # ensure grads for local rank when all_* features don't 
have a gradient + gathered_image_features[rank] = image_features + gathered_text_features[rank] = text_features + all_image_features = torch.cat(gathered_image_features, dim=0) + all_text_features = torch.cat(gathered_text_features, dim=0) + + return all_image_features, all_text_features + + +class ClipLoss(nn.Module): + def __init__( + self, + local_loss=False, + gather_with_grad=False, + cache_labels=False, + rank=0, + world_size=1, + use_horovod=False, + ): + super().__init__() + self.local_loss = local_loss + self.gather_with_grad = gather_with_grad + self.cache_labels = cache_labels + self.rank = rank + self.world_size = world_size + self.use_horovod = use_horovod + + # cache state + self.prev_num_logits = 0 + self.labels = {} + + def forward(self, image_features, text_features, logit_scale): + device = image_features.device + if self.world_size > 1: + all_image_features, all_text_features = gather_features( + image_features, + text_features, + self.local_loss, + self.gather_with_grad, + self.rank, + self.world_size, + self.use_horovod, + ) + + if self.local_loss: + logits_per_image = logit_scale * image_features @ all_text_features.T + logits_per_text = logit_scale * text_features @ all_image_features.T + else: + logits_per_image = ( + logit_scale * all_image_features @ all_text_features.T + ) + logits_per_text = logits_per_image.T + else: + logits_per_image = logit_scale * image_features @ text_features.T + logits_per_text = logit_scale * text_features @ image_features.T + + # calculated ground-truth and cache if enabled + num_logits = logits_per_image.shape[0] + if self.prev_num_logits != num_logits or device not in self.labels: + labels = torch.arange(num_logits, device=device, dtype=torch.long) + if self.world_size > 1 and self.local_loss: + labels = labels + num_logits * self.rank + if self.cache_labels: + self.labels[device] = labels + self.prev_num_logits = num_logits + else: + labels = self.labels[device] + + total_loss = ( + 
F.cross_entropy(logits_per_image, labels) + + F.cross_entropy(logits_per_text, labels) + ) / 2 + return total_loss diff --git a/lavis/models/clip_models/model.py b/lavis/models/clip_models/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8c3d5651848c6935e584abab1f9ecaad873b5392 --- /dev/null +++ b/lavis/models/clip_models/model.py @@ -0,0 +1,1254 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/mlfoundations/open_clip +""" + +""" CLIP Model +Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. +""" + +import datetime +import json +import logging +import os +import re +import time +import warnings +from collections import OrderedDict +from copy import deepcopy +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from lavis.common.registry import registry +from lavis.common.utils import get_abs_path +from lavis.models.base_model import BaseModel +from lavis.models.clip_models.clip_outputs import ClipOutput, ClipOutputFeatures +from lavis.models.clip_models.timm_model import TimmModel +from lavis.models.clip_models.transform import image_transform +from lavis.models.clip_models.utils import freeze_batch_norm_2d +from lavis.tasks.multimodal_classification import MultimodalClassificationTask +from torch import nn + +from .pretrained import ( + download_pretrained, + get_pretrained_url, + list_pretrained_tag_models, +) + +_MODEL_CONFIG_PATHS = [Path(__file__).parent.parent.parent / f"configs/models/clip/"] +_MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, 
inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict( + [ + ("-1", nn.AvgPool2d(stride)), + ( + "0", + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False, + ), + ), + ("1", nn.BatchNorm2d(planes * self.expansion)), + ] + ) + ) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + def __init__( + self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None + ): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5 + ) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute( + 2, 0, 
1 + ) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] + ), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False, + ) + + return x[0] + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. + - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, image_size=224, width=64): + super().__init__() + self.output_dim = output_dim + self.image_size = image_size + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = 
self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim) + + self.init_parameters() + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def init_parameters(self): + if self.attnpool is not None: + std = self.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.attnpool.c_proj.weight, std=std) + + for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + nn.init.zeros_(param) + + def lock(self, unlocked_groups=0, freeze_bn_stats=False): + assert ( + unlocked_groups == 0 + ), "partial locking not currently supported for this model" + for param in self.parameters(): + param.requires_grad = False + if freeze_bn_stats: + freeze_batch_norm_2d(self) + + def stem(self, x): + for conv, bn in [ + (self.conv1, self.bn1), + (self.conv2, self.bn2), + (self.conv3, self.bn3), + ]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + def forward(self, x): + x = self.stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 
+ return x.to(orig_type) + + +class QuickGELU(nn.Module): + # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, d_model: int, n_head: int, act_layer: Callable = nn.GELU): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict( + [ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", act_layer()), + ("c_proj", nn.Linear(d_model * 4, d_model)), + ] + ) + ) + self.ln_2 = LayerNorm(d_model) + + def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0] + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, width: int, layers: int, heads: int, act_layer: Callable = nn.GELU + ): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.ModuleList( + [ + ResidualAttentionBlock(width, heads, act_layer=act_layer) + for _ in range(layers) + ] + ) + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + for r in self.resblocks: + x = r(x, attn_mask=attn_mask) + return x + + +class VisualTransformer(nn.Module): + def __init__( + self, + image_size: int, + patch_size: int, + width: int, + layers: int, + heads: int, + output_dim: int, + act_layer: Callable = nn.GELU, + ): + super().__init__() + self.image_size = image_size + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False, + ) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + 
self.positional_embedding = nn.Parameter( + scale * torch.randn((image_size // patch_size) ** 2 + 1, width) + ) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads, act_layer=act_layer) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def lock(self, unlocked_groups=0, freeze_bn_stats=False): + assert ( + unlocked_groups == 0 + ), "partial locking not currently supported for this model" + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat( + [ + self.class_embedding.to(x.dtype) + + torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device + ), + x, + ], + dim=1, + ) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +@dataclass +class CLIPVisionCfg: + layers: Union[Tuple[int, int, int, int], int] = 12 + width: int = 768 + patch_size: int = 16 + image_size: Union[Tuple[int, int], int] = 224 + timm_model_name: str = ( + None # a valid model name overrides layers, width, patch_size + ) + timm_model_pretrained: bool = ( + False # use (imagenet) pretrained weights for named model + ) + timm_pool: str = ( + "avg" # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '') + ) + timm_proj: str = ( + "linear" # linear projection for timm model output ('linear', 'mlp', '') + ) + + +@dataclass +class CLIPTextCfg: + context_length: int + vocab_size: int + width: int + heads: int + layers: int + + +@registry.register_model("clip") 
def __init__(
    self,
    embed_dim: int,
    vision_cfg: CLIPVisionCfg,
    text_cfg: CLIPTextCfg,
    quick_gelu: bool = False,
):
    """Build the vision tower, the text transformer and the projection heads.

    Args:
        embed_dim: Size of the joint embedding space; used as the output
            dimension of the vision tower and of ``text_projection``.
        vision_cfg: Vision tower settings; a plain dict is converted to
            ``CLIPVisionCfg``.
        text_cfg: Text tower settings; a plain dict is converted to
            ``CLIPTextCfg``.
        quick_gelu: Use ``QuickGELU`` instead of ``nn.GELU`` as activation
            (ignored for both towers when a timm vision model is used).
    """
    from .tokenizer import tokenize

    super().__init__()

    self.tokenizer = tokenize
    # ClipLoss is created on first access of the `loss` property.
    self._loss = None

    if isinstance(vision_cfg, dict):
        vision_cfg = CLIPVisionCfg(**vision_cfg)
    if isinstance(text_cfg, dict):
        text_cfg = CLIPTextCfg(**text_cfg)

    # Must be set before build_attention_mask() is called below.
    self.context_length = text_cfg.context_length

    # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
    # memory efficient in recent PyTorch releases (>= 1.10).
    # NOTE: timm models always use native GELU regardless of quick_gelu flag.
    act_layer = QuickGELU if quick_gelu else nn.GELU

    # Vision tower selection: timm model > ResNet (tuple/list of layers) > ViT.
    if vision_cfg.timm_model_name:
        self.visual = TimmModel(
            vision_cfg.timm_model_name,
            pretrained=vision_cfg.timm_model_pretrained,
            pool=vision_cfg.timm_pool,
            proj=vision_cfg.timm_proj,
            embed_dim=embed_dim,
            image_size=vision_cfg.image_size,
        )
        act_layer = (
            nn.GELU
        )  # so that text transformer doesn't use QuickGELU w/ timm models
    elif isinstance(vision_cfg.layers, (tuple, list)):
        vision_heads = vision_cfg.width * 32 // 64
        self.visual = ModifiedResNet(
            layers=vision_cfg.layers,
            output_dim=embed_dim,
            heads=vision_heads,
            image_size=vision_cfg.image_size,
            width=vision_cfg.width,
        )
    else:
        # One attention head per 64 channels of width.
        vision_heads = vision_cfg.width // 64
        self.visual = VisualTransformer(
            image_size=vision_cfg.image_size,
            patch_size=vision_cfg.patch_size,
            width=vision_cfg.width,
            layers=vision_cfg.layers,
            heads=vision_heads,
            output_dim=embed_dim,
            act_layer=act_layer,
        )

    # Text tower.
    self.transformer = Transformer(
        width=text_cfg.width,
        layers=text_cfg.layers,
        heads=text_cfg.heads,
        act_layer=act_layer,
    )

    self.vocab_size = text_cfg.vocab_size
    self.token_embedding = nn.Embedding(text_cfg.vocab_size, text_cfg.width)
    # Allocated uninitialised; filled in by init_parameters().
    self.positional_embedding = nn.Parameter(
        torch.empty(self.context_length, text_cfg.width)
    )
    self.ln_final = LayerNorm(text_cfg.width)

    # Allocated uninitialised; filled in by init_parameters().
    self.text_projection = nn.Parameter(torch.empty(text_cfg.width, embed_dim))
    self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
    # Non-persistent buffer: follows the module across devices but is not saved in state_dict.
    self.register_buffer("attn_mask", self.build_attention_mask(), persistent=False)

    # Used by before_evaluation() to build the zero-shot classifier.
    self.prompt_templates = openai_imagenet_template
    self.classifier = None

    # Must run last: it initialises the parameters created above.
    self.init_parameters()
def build_attention_mask(self):
    """Return the additive causal mask for the text transformer.

    The result is a ``context_length x context_length`` tensor holding
    ``-inf`` strictly above the main diagonal and ``0`` on and below it, so
    every position may attend only to itself and to earlier positions.
    """
    size = self.context_length
    # PyTorch attention masks are additive, hence -inf for blocked positions.
    blocked = torch.full((size, size), float("-inf"))
    # Keep -inf only strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)
def forward(self, samples):
    """Encode a batch and, when both modalities are present, compute the CLIP loss.

    Args:
        samples: Mapping with optional keys ``"image"`` (passed to
            ``encode_image``) and ``"text_input"`` (raw text, tokenized here
            and moved to ``self.device``).

    Returns:
        * only ``"text_input"`` given: the output of ``encode_text``;
        * only ``"image"`` given: the output of ``encode_image``;
        * both given: a ``ClipOutput`` carrying the raw and L2-normalised
          embeddings, the contrastive loss and ``exp(logit_scale)``.

    Raises:
        ValueError: if neither ``"image"`` nor ``"text_input"`` is provided.
    """
    image = samples.get("image")
    text = samples.get("text_input")

    if image is None and text is None:
        # Fail early with a clear message instead of crashing inside the
        # token embedding on a None input.
        raise ValueError(
            'samples must contain at least one of "image" or "text_input".'
        )

    if text is not None:
        text = self.tokenizer(text).to(self.device)

    # Single-modality calls return the un-normalised encoder output.
    if image is None:
        return self.encode_text(text)
    if text is None:
        return self.encode_image(image)

    image_embeds = self.encode_image(image)
    image_features = F.normalize(image_embeds, dim=-1)

    text_embeds = self.encode_text(text)
    text_features = F.normalize(text_embeds, dim=-1)

    # Computed once and shared by the loss and the returned output.
    logit_scale_exp = self.logit_scale.exp()
    loss = self.loss(image_features, text_features, logit_scale_exp)

    return ClipOutput(
        intermediate_output=ClipOutputFeatures(
            image_embeds=image_embeds,
            image_embeds_proj=image_features,
            text_embeds=text_embeds,
            text_embeds_proj=text_features,
        ),
        loss=loss,
        logit_scale_exp=logit_scale_exp,
    )
def zero_shot_classifier(self, classnames, templates):
    """Build zero-shot classification weights from prompt templates.

    Every class name is rendered through every template, the prompts are
    tokenized and encoded, and the L2-normalised prompt embeddings are
    averaged and re-normalised into one vector per class.

    Args:
        classnames: Iterable of class names.
        templates: Iterable of callables mapping a class name to a prompt.

    Returns:
        Tensor with one column per class (class vectors stacked on dim 1),
        placed on ``self.device``. Computed without gradient tracking.
    """

    def _class_vector(classname):
        prompts = [render(classname) for render in templates]  # format with class
        tokens = self.tokenizer(prompts).to(self.device)  # tokenize
        prompt_embeddings = F.normalize(self.encode_text(tokens), dim=-1)
        mean_embedding = prompt_embeddings.mean(dim=0)
        return mean_embedding / mean_embedding.norm()

    with torch.no_grad():
        per_class = [_class_vector(classname) for classname in classnames]
        return torch.stack(per_class, dim=1).to(self.device)
def zero_shot_predict(self, image_path, categories):
    """Classify one image file against free-text categories.

    Args:
        image_path: Path of the image file to classify.
        categories: List of category strings to score the image against.

    Returns:
        Tensor of softmax probabilities over ``categories`` (one row for the
        single image). The probabilities are also printed.

    Raises:
        AssertionError: if ``categories`` is not a list or ``image_path``
            does not exist.
    """
    # Validation intentionally stays as asserts so the exception type seen
    # by existing callers does not change.
    assert isinstance(
        categories, list
    ), f"categories must be a list, got {type(categories)}."
    assert os.path.exists(image_path), f"File {image_path} does not exist."

    from lavis.processors.clip_processors import ClipImageEvalProcessor
    from PIL import Image

    image_preprocess = ClipImageEvalProcessor()
    # Close the file handle once the image has been preprocessed, and place
    # the inputs on the model's device like the other inference methods do.
    with Image.open(image_path) as raw_image:
        image = image_preprocess(raw_image).unsqueeze(0).to(self.device)

    text = self.tokenizer(categories).to(self.device)

    with torch.no_grad():
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # 100.0 is the fixed logit scale applied before the softmax.
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]
    return text_probs
def convert_weights_to_fp16(model: nn.Module):
    """Cast the applicable parameters of ``model`` to half precision, in place.

    Affected tensors: weights and biases of ``nn.Conv1d`` / ``nn.Conv2d`` /
    ``nn.Linear``, the projection weights and biases of
    ``nn.MultiheadAttention``, and any ``text_projection`` / ``proj``
    attribute found on a module. Other parameters are left untouched.
    """
    mha_tensor_names = (
        "in_proj_weight",
        "q_proj_weight",
        "k_proj_weight",
        "v_proj_weight",
        "in_proj_bias",
        "bias_k",
        "bias_v",
    )
    projection_names = ("text_projection", "proj")

    def _halve(tensor):
        # Optional tensors (e.g. a disabled bias) are None and are skipped.
        if tensor is not None:
            tensor.data = tensor.data.half()

    def _convert_weights_to_fp16(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            _halve(l.weight)
            _halve(l.bias)

        if isinstance(l, nn.MultiheadAttention):
            for tensor_name in mha_tensor_names:
                _halve(getattr(l, tensor_name))

        for projection_name in projection_names:
            _halve(getattr(l, projection_name, None))

    # nn.Module.apply visits every submodule as well as the model itself.
    model.apply(_convert_weights_to_fp16)
def trace_model(model, batch_size=256, device=torch.device("cpu")):
    """Trace ``model`` with ``torch.jit.trace_module`` and return the traced module.

    ``forward``, ``encode_text`` and ``encode_image`` are traced with dummy
    inputs: an all-ones image batch of shape
    ``(batch_size, 3, image_size, image_size)`` and an all-zero int token
    batch of shape ``(batch_size, context_length)``.

    Args:
        model: Module exposing ``visual.image_size``, ``context_length``,
            ``forward(image, text)``, ``encode_text`` and ``encode_image``.
        batch_size: Batch size of the example inputs.
        device: Device on which the example inputs are created.

    Returns:
        The traced module, with ``visual.image_size`` set to the original value.
    """
    # NOTE(review): CLIP.forward in this file takes a single `samples` dict;
    # tracing `forward` with (images, text) presumably only works for models
    # that keep the two-argument forward signature - confirm before use.
    model.eval()
    image_size = model.visual.image_size
    example_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
    example_text = torch.zeros(
        (batch_size, model.context_length), dtype=torch.int, device=device
    )
    model = torch.jit.trace_module(
        model,
        inputs=dict(
            forward=(example_images, example_text),
            encode_text=(example_text,),
            encode_image=(example_images,),
        ),
    )
    # Re-attach image_size on the traced module's visual tower.
    model.visual.image_size = image_size
    # The traced module used to be discarded by a bare `return`.
    return model
def load_state_dict(checkpoint_path: str, map_location="cpu"):
    """Load a state dict from a checkpoint file.

    If the checkpoint is a dict with a ``"state_dict"`` entry, that entry is
    used; otherwise the loaded object itself is treated as the state dict.
    When the first key carries the ``"module."`` prefix, that prefix is
    removed from every key that has it.

    Args:
        checkpoint_path: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``.

    Returns:
        The (possibly re-keyed) state dict.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
        state_dict = checkpoint["state_dict"]
    else:
        state_dict = checkpoint

    prefix = "module."
    # Default to "" so an empty state dict does not raise StopIteration.
    first_key = next(iter(state_dict), "")
    if first_key.startswith(prefix):
        # Strip the prefix only where present, rather than blindly slicing
        # seven characters off every key.
        state_dict = {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
        }
    return state_dict
logging.error( + f"Model config for {model_name} not found; available models {list_models()}." + ) + raise RuntimeError(f"Model config for {model_name} not found.") + + if force_quick_gelu: + # override for use of QuickGELU on non-OpenAI transformer models + model_cfg["quick_gelu"] = True + + if pretrained_image: + if "timm_model_name" in model_cfg.get("vision_cfg", {}): + # pretrained weight loading for timm models set via vision_cfg + model_cfg["vision_cfg"]["timm_model_pretrained"] = True + else: + assert ( + False + ), "pretrained image towers currently only supported for timm models" + + model = CLIP(**model_cfg) + + if pretrained: + checkpoint_path = "" + url = get_pretrained_url(model_name, pretrained) + if url: + checkpoint_path = download_pretrained(url) + elif os.path.exists(pretrained): + checkpoint_path = pretrained + + if checkpoint_path: + logging.info(f"Loading pretrained {model_name} weights ({pretrained}).") + model.load_state_dict(load_state_dict(checkpoint_path)) + else: + logging.warning( + f"Pretrained weights ({pretrained}) not found for model {model_name}." + ) + raise RuntimeError( + f"Pretrained weights ({pretrained}) not found for model {model_name}." 
def load_openai_model(
    name: str,
    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
    jit: bool = True,
):
    """Load an OpenAI CLIP model.

    Parameters
    ----------
    name : str
        A model name with an "openai" pretrained URL, or the path to a local
        model file (JIT archive or saved state dict)
    device : Union[str, torch.device]
        The device to put the loaded model
    jit : bool
        Whether to load the optimized JIT model (default) or more hackable non-JIT model.

    Returns
    -------
    model : torch.nn.Module
        The CLIP model. Only the model is returned; no preprocessing
        transform is created here.

    Raises
    ------
    RuntimeError
        If `name` is neither a known OpenAI model nor an existing file.
    """
    if get_pretrained_url(name, "openai"):
        model_path = download_pretrained(get_pretrained_url(name, "openai"))
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(
            f"Model {name} not found; available models = {list_openai_models()}"
        )

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(
                f"File {model_path} is not a JIT archive. Loading as a state dict instead"
            )
            jit = False
        state_dict = torch.load(model_path, map_location="cpu")

    if not jit:
        # Non-JIT path: rebuild a regular nn.Module from the weights.
        try:
            model = build_model_from_openai_state_dict(
                state_dict or model.state_dict()
            ).to(device)
        except KeyError:
            # Fallback for checkpoints that nest the weights under "state_dict";
            # the first 7 characters of every key are dropped.
            # NOTE(review): presumably strips a "module." prefix - confirm.
            # NOTE(review): state_dict is None here when the file was a JIT
            # archive, so this line would raise TypeError - confirm intent.
            sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
            model = build_model_from_openai_state_dict(sd).to(device)

        if str(device) == "cpu":
            model.float()
        return model

    # patch the device names
    # Trace a tiny function on the target device to obtain a constant node
    # carrying that device, then copy it over "cuda" constants in the model graphs.
    device_holder = torch.jit.trace(
        lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]
    )
    device_node = [
        n
        for n in device_holder.graph.findAllNodes("prim::Constant")
        if "Device" in repr(n)
    ][-1]

    def patch_device(module):
        # Collect the graphs exposed by the module; reading .graph may raise
        # RuntimeError, in which case there is nothing to patch.
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []

        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                if "value" in node.attributeNames() and str(node["value"]).startswith(
                    "cuda"
                ):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 on CPU
    if str(device) == "cpu":
        # Same trick as above: trace a .float() call to get a float32 dtype
        # constant node to copy from.
        float_holder = torch.jit.trace(
            lambda: torch.ones([]).float(), example_inputs=[]
        )
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []

            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [
                        1,
                        2,
                    ]:  # dtype can be the second or third argument to aten::to()
                        # NOTE(review): 5 is presumably the dtype code for
                        # half precision - confirm against torch's ScalarType.
                        if inputs[i].node()["value"] == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)
        model.float()

    # ensure image_size attr available at consistent location for both jit and non-jit
    model.visual.image_size = model.input_resolution.item()
    return model
photo of the {c}.", + lambda c: f"a sculpture of the {c}.", + lambda c: f"a bright photo of the {c}.", + lambda c: f"a cropped photo of a {c}.", + lambda c: f"a plastic {c}.", + lambda c: f"a photo of the dirty {c}.", + lambda c: f"a jpeg corrupted photo of a {c}.", + lambda c: f"a blurry photo of the {c}.", + lambda c: f"a photo of the {c}.", + lambda c: f"a good photo of the {c}.", + lambda c: f"a rendering of the {c}.", + lambda c: f"a {c} in a video game.", + lambda c: f"a photo of one {c}.", + lambda c: f"a doodle of a {c}.", + lambda c: f"a close-up photo of the {c}.", + lambda c: f"a photo of a {c}.", + lambda c: f"the origami {c}.", + lambda c: f"the {c} in a video game.", + lambda c: f"a sketch of a {c}.", + lambda c: f"a doodle of the {c}.", + lambda c: f"a origami {c}.", + lambda c: f"a low resolution photo of a {c}.", + lambda c: f"the toy {c}.", + lambda c: f"a rendition of the {c}.", + lambda c: f"a photo of the clean {c}.", + lambda c: f"a photo of a large {c}.", + lambda c: f"a rendition of a {c}.", + lambda c: f"a photo of a nice {c}.", + lambda c: f"a photo of a weird {c}.", + lambda c: f"a blurry photo of a {c}.", + lambda c: f"a cartoon {c}.", + lambda c: f"art of a {c}.", + lambda c: f"a sketch of the {c}.", + lambda c: f"a embroidered {c}.", + lambda c: f"a pixelated photo of a {c}.", + lambda c: f"itap of the {c}.", + lambda c: f"a jpeg corrupted photo of the {c}.", + lambda c: f"a good photo of a {c}.", + lambda c: f"a plushie {c}.", + lambda c: f"a photo of the nice {c}.", + lambda c: f"a photo of the small {c}.", + lambda c: f"a photo of the weird {c}.", + lambda c: f"the cartoon {c}.", + lambda c: f"art of the {c}.", + lambda c: f"a drawing of the {c}.", + lambda c: f"a photo of the large {c}.", + lambda c: f"a black and white photo of a {c}.", + lambda c: f"the plushie {c}.", + lambda c: f"a dark photo of a {c}.", + lambda c: f"itap of a {c}.", + lambda c: f"graffiti of the {c}.", + lambda c: f"a toy {c}.", + lambda c: f"itap of my 
{c}.", + lambda c: f"a photo of a cool {c}.", + lambda c: f"a photo of a small {c}.", + lambda c: f"a tattoo of the {c}.", +] diff --git a/lavis/models/clip_models/pics/CLIP.png b/lavis/models/clip_models/pics/CLIP.png new file mode 100644 index 0000000000000000000000000000000000000000..a1b5ec9171fd7a51e36e845a02304eb837142ba1 Binary files /dev/null and b/lavis/models/clip_models/pics/CLIP.png differ diff --git a/lavis/models/clip_models/pretrained.py b/lavis/models/clip_models/pretrained.py new file mode 100644 index 0000000000000000000000000000000000000000..a8d9834952263a0cd19c775d2576628e4ee580cd --- /dev/null +++ b/lavis/models/clip_models/pretrained.py @@ -0,0 +1,182 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/mlfoundations/open_clip +""" + +import hashlib +import os +import urllib +import warnings + +from tqdm import tqdm + +_RN50 = dict( + openai="https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + yfcc15m="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt", + cc12m="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt", +) + +_RN50_quickgelu = dict( + openai="https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + yfcc15m="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt", + cc12m="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt", +) + +_RN101 = dict( + openai="https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", + 
yfcc15m="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt", +) + +_RN101_quickgelu = dict( + openai="https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", + yfcc15m="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt", +) + +_RN50x4 = dict( + openai="https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", +) + +_RN50x16 = dict( + openai="https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", +) + +_RN50x64 = dict( + openai="https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt", +) + +_VITB32 = dict( + openai="https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", + laion400m_e31="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt", + laion400m_e32="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt", + laion400m_avg="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_avg-8a00ab3c.pt", +) + +_VITB32_quickgelu = dict( + openai="https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", + laion400m_e31="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt", + laion400m_e32="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt", + 
def _sha256_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA256 digest of the file at ``path``, read in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def download_pretrained(url: str, root: str = os.path.expanduser("~/.cache/clip")):
    """Download pretrained weights into ``root`` and return the local path.

    For URLs containing ``"openaipublic"`` the second-to-last path component
    is taken as the expected SHA256 of the file and is verified; other URLs
    are not verified. An existing file is reused when it needs no
    verification or its checksum matches.

    Args:
        url: URL of the weight file.
        root: Cache directory; created if missing.

    Returns:
        Path of the downloaded (or cached) file.

    Raises:
        RuntimeError: if the target path exists but is not a regular file, or
            if the downloaded file fails the checksum.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly.
    import urllib.request

    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2] if "openaipublic" in url else ""

    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if not expected_sha256:
            return download_target
        if _sha256_of_file(download_target) == expected_sha256:
            return download_target
        warnings.warn(
            f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
        )

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        # Content-Length may be absent; tqdm then shows progress without a total.
        content_length = source.info().get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(
            total=total,
            ncols=80,
            unit="iB",
            unit_scale=True,
        ) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if expected_sha256 and _sha256_of_file(download_target) != expected_sha256:
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not match"
        )

    return download_target
class TimmModel(nn.Module):
    """Adapter that wraps a timm model as a trunk followed by a pooling/projection head.

    The trunk is created with ``timm.create_model`` and its classifier is reset
    to zero classes; ``self.head`` then maps the trunk output to ``embed_dim``.

    # FIXME this adapter is a work in progress, may change in ways that break weight compat
    """

    def __init__(
        self,
        model_name,
        embed_dim,
        image_size=224,
        pool="avg",
        proj="linear",
        drop=0.0,
        pretrained=False,
    ):
        """Build the trunk and the head.

        Args:
            model_name: Name passed to ``timm.create_model``.
            embed_dim: Output width of the head.
            image_size: Stored on the instance as a 2-tuple.
            pool: ``"abs_attn"`` / ``"rot_attn"`` select an attention pooling
                head; any other truthy value is forwarded to the trunk as its
                ``global_pool``; a falsy value keeps the trunk's default pool.
            proj: ``"linear"`` or ``"mlp"`` adds a projection layer; must be
                truthy when no attention pooling is used.
            drop: Dropout rate used by the projection layer.
            pretrained: Forwarded to ``timm.create_model``.

        Raises:
            RuntimeError: If timm could not be imported.
        """
        super().__init__()
        if timm is None:
            raise RuntimeError("Please `pip install timm` to use timm models.")

        self.image_size = to_2tuple(image_size)
        self.trunk = timm.create_model(model_name, pretrained=pretrained)
        # A configured "pool_size" is taken to mean the trunk emits 2D feature maps.
        feat_size = self.trunk.default_cfg.get("pool_size", None)
        feature_ndim = 1 if not feat_size else 2
        if pool in ("abs_attn", "rot_attn"):
            assert feature_ndim == 2
            # if attn pooling used, remove both classifier and default pool
            self.trunk.reset_classifier(0, global_pool="")
        else:
            # reset global pool if pool config set, otherwise leave as network default
            reset_kwargs = dict(global_pool=pool) if pool else {}
            self.trunk.reset_classifier(0, **reset_kwargs)
        prev_chs = self.trunk.num_features

        # Layers are collected by name so nn.Sequential keeps "pool"/"drop"/"proj"/"mlp" keys.
        head_layers = OrderedDict()
        if pool == "abs_attn":
            head_layers["pool"] = AttentionPool2d(
                prev_chs, feat_size=feat_size, out_features=embed_dim
            )
            prev_chs = embed_dim
        elif pool == "rot_attn":
            head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
            prev_chs = embed_dim
        else:
            assert proj, "projection layer needed if non-attention pooling is used."

        # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
        if proj == "linear":
            head_layers["drop"] = nn.Dropout(drop)
            head_layers["proj"] = nn.Linear(prev_chs, embed_dim)
        elif proj == "mlp":
            head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop)

        self.head = nn.Sequential(head_layers)

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """Freeze trunk parameters (the head is not touched).

        Args:
            unlocked_groups (int): leave last n layer groups unlocked (default: 0)
            freeze_bn_stats (bool): also pass the frozen part of the trunk to
                ``freeze_batch_norm_2d``.

        Raises:
            RuntimeError: If a partial freeze is requested but the installed
                timm lacks ``group_modules`` / ``group_parameters``.
        """
        if not unlocked_groups:
            # lock full model
            for param in self.trunk.parameters():
                param.requires_grad = False
            if freeze_bn_stats:
                freeze_batch_norm_2d(self.trunk)
        else:
            # NOTE: partial freeze requires latest timm (master) branch and is subject to change
            try:
                # FIXME import here until API stable and in an official release
                from timm.models.helpers import group_modules, group_parameters
            except ImportError:
                raise RuntimeError(
                    "Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`"
                )
            matcher = self.trunk.group_matcher()
            gparams = group_parameters(self.trunk, matcher)
            # Groups 0..max_layer_id (inclusive) are frozen; the last `unlocked_groups` stay trainable.
            max_layer_id = max(gparams.keys())
            max_layer_id = max_layer_id - unlocked_groups
            for group_idx in range(max_layer_id + 1):
                group = gparams[group_idx]
                for param in group:
                    self.trunk.get_parameter(param).requires_grad = False
            if freeze_bn_stats:
                gmodules = group_modules(self.trunk, matcher, reverse=True)
                # Keep only the module names whose group id falls in the frozen range.
                gmodules = {k for k, v in gmodules.items() if v <= max_layer_id}
                freeze_batch_norm_2d(self.trunk, gmodules)

    def forward(self, x):
        """Run the trunk, then the head, and return the head output."""
        x = self.trunk(x)
        x = self.head(x)
        return x
class AttentionPool2d(nn.Module):
    """Attention based 2D feature pooling w/ learned (absolute) pos embedding.

    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.
    It was based on impl in CLIP by OpenAI
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    NOTE: This requires feature size upon construction and will prevent adaptive sizing of the network.
    """

    def __init__(
        self,
        in_features: int,
        feat_size: Union[int, Tuple[int, int]],
        out_features: int = None,
        embed_dim: int = None,
        num_heads: int = 4,
        qkv_bias: bool = True,
    ):
        """
        Args:
            in_features: Channel count of the incoming feature map.
            feat_size: Spatial size (H, W) of the feature map; an int means square.
            out_features: Output width; defaults to ``in_features``.
            embed_dim: Attention width; defaults to ``in_features`` and must be
                divisible by ``num_heads``.
            num_heads: Number of attention heads.
            qkv_bias: Whether the fused q/k/v projection has a bias term.
        """
        super().__init__()

        embed_dim = embed_dim or in_features
        out_features = out_features or in_features
        assert embed_dim % num_heads == 0
        self.feat_size = to_2tuple(feat_size)
        self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(embed_dim, out_features)
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim**-0.5

        # One learned position per spatial location plus one for the prepended mean token.
        spatial_dim = self.feat_size[0] * self.feat_size[1]
        self.pos_embed = nn.Parameter(torch.zeros(spatial_dim + 1, in_features))
        trunc_normal_(self.pos_embed, std=in_features**-0.5)
        trunc_normal_(self.qkv.weight, std=in_features**-0.5)
        # With qkv_bias=False the Linear layer has no bias tensor to initialise.
        if self.qkv.bias is not None:
            nn.init.zeros_(self.qkv.bias)

    def forward(self, x):
        """Pool a (B, C, H, W) feature map into a (B, out_features) tensor.

        H and W must equal the ``feat_size`` given at construction.
        """
        B, _, H, W = x.shape
        N = H * W
        assert self.feat_size[0] == H
        assert self.feat_size[1] == W
        # (B, C, H, W) -> (B, N, C), then prepend the spatial mean as a query token.
        x = x.reshape(B, -1, N).permute(0, 2, 1)
        x = torch.cat([x.mean(1, keepdim=True), x], dim=1)
        x = x + self.pos_embed.unsqueeze(0).to(x.dtype)

        x = (
            self.qkv(x)
            .reshape(B, N + 1, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = x[0], x[1], x[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1)
        x = self.proj(x)
        # Only the mean-token position is returned as the pooled feature.
        return x[:, 0]
def inv_freq_bands(
    num_bands: int,
    temperature: float = 100000.0,
    step: int = 2,
    dtype: torch.dtype = torch.float32,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    """Return inverse frequencies ``1 / temperature ** (i / num_bands)``.

    ``i`` runs over ``range(0, num_bands, step)``, so the result holds
    ``ceil(num_bands / step)`` values, created with the given dtype and device.
    """
    exponents = torch.arange(0, num_bands, step, dtype=dtype, device=device) / num_bands
    denominators = temperature**exponents
    return 1.0 / denominators
+ + stack_dim = ( + 2 if interleave_sin_cos else 1 + ) # stack sin, cos, sin, cos instead of sin sin cos cos + pos_emb = torch.stack([torch.sin(pos2), torch.cos(pos2)], dim=stack_dim).flatten(1) + return pos_emb + + +def build_fourier_pos_embed( + feat_shape: List[int], + bands: Optional[torch.Tensor] = None, + num_bands: int = 64, + max_res: int = 224, + linear_bands: bool = False, + include_grid: bool = False, + concat_out: bool = True, + in_pixels: bool = True, + dtype: torch.dtype = torch.float32, + device: Optional[torch.device] = None, +) -> List[torch.Tensor]: + if bands is None: + if in_pixels: + bands = pixel_freq_bands( + num_bands, + float(max_res), + linear_bands=linear_bands, + dtype=dtype, + device=device, + ) + else: + bands = inv_freq_bands(num_bands, step=1, dtype=dtype, device=device) + else: + if device is None: + device = bands.device + if dtype is None: + dtype = bands.dtype + + if in_pixels: + grid = torch.stack( + torch.meshgrid( + [ + torch.linspace(-1.0, 1.0, steps=s, device=device, dtype=dtype) + for s in feat_shape + ] + ), + dim=-1, + ) + else: + grid = torch.stack( + torch.meshgrid( + [torch.arange(s, device=device, dtype=dtype) for s in feat_shape] + ), + dim=-1, + ) + grid = grid.unsqueeze(-1) + pos = grid * bands + + pos_sin, pos_cos = pos.sin(), pos.cos() + out = (grid, pos_sin, pos_cos) if include_grid else (pos_sin, pos_cos) + # FIXME torchscript doesn't like multiple return types, probably need to always cat? 
class FourierEmbed(nn.Module):
    """Concatenate Fourier (sin/cos) position features onto a feature map."""

    def __init__(
        self,
        max_res: int = 224,
        num_bands: int = 64,
        concat_grid=True,
        keep_spatial=False,
    ):
        """
        Args:
            max_res: Maximum resolution, used as ``max_freq`` of the frequency bands.
            num_bands: Number of frequency bands.
            concat_grid: Also include the raw coordinate grid in the embedding.
            keep_spatial: Keep the output channel-first with spatial dims instead
                of flattening it to (B, H*W, C).
        """
        super().__init__()
        self.max_res = max_res
        self.num_bands = num_bands
        self.concat_grid = concat_grid
        self.keep_spatial = keep_spatial
        # pixel_freq_bands takes (num_bands, max_freq); keywords keep the two
        # from being swapped, which previously produced `max_res` bands.
        self.register_buffer(
            "bands",
            pixel_freq_bands(num_bands=num_bands, max_freq=float(max_res)),
            persistent=False,
        )

    def forward(self, x):
        """Append the position embedding to *x*.

        *x* is read as (B, C, *spatial); the permutes below assume two spatial dims.
        """
        B, C = x.shape[:2]
        feat_shape = x.shape[2:]
        emb = build_fourier_pos_embed(
            feat_shape,
            self.bands,
            include_grid=self.concat_grid,
            dtype=x.dtype,
            device=x.device,
        )
        # Merge the per-axis and per-band dims into a single trailing channel dim.
        emb = emb.transpose(-1, -2).flatten(len(feat_shape))
        batch_expand = (B,) + (-1,) * (x.ndim - 1)

        # FIXME support nD
        if self.keep_spatial:
            x = torch.cat(
                [x, emb.unsqueeze(0).expand(batch_expand).permute(0, 3, 1, 2)], dim=1
            )
        else:
            x = torch.cat(
                [x.permute(0, 2, 3, 1), emb.unsqueeze(0).expand(batch_expand)], dim=-1
            )
            x = x.reshape(B, feat_shape.numel(), -1)

        return x
def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    """Fill *tensor* in place with truncated-normal samples and return it.

    Cut & paste from PyTorch official master until it's in a few official releases - RW
    Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    """

    def norm_cdf(x):
        # Standard normal cumulative distribution function.
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    lowest_sane_mean = a - 2 * std
    highest_sane_mean = b + 2 * std
    if mean < lowest_sane_mean or mean > highest_sane_mean:
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    with torch.no_grad():
        # Sample through the inverse CDF: draw uniformly between the CDF values
        # of the two cut-offs, then map back to the normal distribution.
        cdf_low = norm_cdf((a - mean) / std)
        cdf_high = norm_cdf((b - mean) / std)

        # Uniform on [cdf_low, cdf_high], shifted to [2*cdf_low - 1, 2*cdf_high - 1]
        # because erfinv expects inputs in (-1, 1).
        tensor.uniform_(2 * cdf_low - 1, 2 * cdf_high - 1)

        # erfinv turns the uniform draw into a truncated standard normal.
        tensor.erfinv_()

        # Scale and shift to the requested std and mean.
        tensor.mul_(std * math.sqrt(2.0))
        tensor.add_(mean)

        # Guard against numerical spill outside the cut-offs.
        tensor.clamp_(min=a, max=b)
        return tensor


def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    r"""Fill *tensor* in place from a truncated normal distribution.

    Values follow :math:`\mathcal{N}(\text{mean}, \text{std}^2)` restricted to
    :math:`[a, b]`. The sampling method works best when
    :math:`a \leq \text{mean} \leq b`.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Returns:
        The same tensor object, filled in place.

    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
@lru_cache()
def bytes_to_unicode():
    """Map each of the 256 byte values to a unique unicode character.

    The reversible BPE codes operate on unicode strings, so every byte needs a
    character that is neither whitespace nor a control character. Bytes in the
    three ranges below map to themselves; each remaining byte is assigned the
    next code point starting at 256, in ascending byte order.

    Returns:
        dict mapping ``int`` byte value -> single-character ``str``.
    """
    byte_values = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    code_points = list(byte_values)
    shifted = 0
    for byte in range(2**8):
        if byte in byte_values:
            continue
        # Not in the self-mapped ranges: move it above the single-byte range.
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1
    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
class SimpleTokenizer(object):
    """Byte-pair-encoding tokenizer used by CLIP.

    NOTE(review): the end-of-word marker ``</w>`` and the special tokens
    ``<start_of_text>`` / ``<end_of_text>`` had been reduced to empty strings
    in this file; they are restored to the literals used by the upstream
    CLIP / open_clip tokenizer. With empty strings the vocabulary collapses
    onto duplicate keys and ``decode`` inserts a space between every character.
    """

    def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
        """
        Args:
            bpe_path: Path to the gzip-compressed BPE merges file.
            special_tokens: Extra special tokens appended after the built-in
                start/end-of-text tokens.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Close the archive deterministically instead of relying on GC.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split("\n")
        # Skip the header line and keep the first 49152 - 256 - 2 merges.
        merges = merges[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        if not special_tokens:
            special_tokens = ["<start_of_text>", "<end_of_text>"]
        else:
            special_tokens = ["<start_of_text>", "<end_of_text>"] + special_tokens
        vocab.extend(special_tokens)
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Special tokens are pre-cached so bpe() never splits them.
        self.cache = {t: t for t in special_tokens}
        special = "|".join(special_tokens)
        self.pat = re.compile(
            special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )

        self.vocab_size = len(self.encoder)
        self.all_special_ids = [self.encoder[t] for t in special_tokens]

    def bpe(self, token):
        """Return *token* split into space-separated BPE symbols (memoised)."""
        if token in self.cache:
            return self.cache[token]
        # The last symbol carries the end-of-word marker.
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Merge the adjacent pair with the lowest (earliest learned) rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # `first` does not occur again: copy the tail and stop.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lowercase *text*, then return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(
                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
            )
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; each end-of-word marker becomes a space."""
        text = "".join([self.decoder[token] for token in tokens])
        text = (
            bytearray([self.byte_decoder[c] for c in text])
            .decode("utf-8", errors="replace")
            .replace("</w>", " ")
        )
        return text


_tokenizer = SimpleTokenizer()


def tokenize(
    texts: Union[str, List[str]], context_length: int = 77
) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<start_of_text>"]
    eot_token = _tokenizer.encoder["<end_of_text>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
    # Zero-padded on the right up to context_length.
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            tokens = tokens[:context_length]  # Truncate
        result[i, : len(tokens)] = torch.tensor(tokens)

    return result
class ResizeMaxSize(nn.Module):
    """Resize so the longest side equals ``max_size``, then pad to a square."""

    def __init__(
        self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0
    ):
        """
        Args:
            max_size: Target side length in pixels (int).
            interpolation: Interpolation mode passed to the resize call.
            fn: ``"min"`` or ``"max"``; stored as the matching builtin in
                ``self.fn``. ``forward`` always scales by the longest side,
                because the padding step needs both sides to fit in ``max_size``.
            fill: Fill value for the padded border.

        Raises:
            TypeError: If ``max_size`` is not an int.
        """
        super().__init__()
        if not isinstance(max_size, int):
            raise TypeError(f"Size should be int. Got {type(max_size)}")
        self.max_size = max_size
        self.interpolation = interpolation
        # Was `min if fn == "min" else min`, which ignored the argument.
        self.fn = min if fn == "min" else max
        self.fill = fill

    def forward(self, img):
        """Return *img* resized and centre-padded to ``max_size`` x ``max_size``."""
        if isinstance(img, torch.Tensor):
            # Tensor images are [..., H, W]; the leading dims are channels/batch.
            height, width = img.shape[-2:]
        else:
            width, height = img.size
        scale = self.max_size / float(max(height, width))
        new_size = tuple(round(dim * scale) for dim in (height, width))
        if scale != 1.0:
            img = F.resize(img, new_size, self.interpolation)
        # Pad even when no resize was needed, so a non-square image whose
        # longest side already equals max_size still comes out square.
        pad_h = self.max_size - new_size[0]
        pad_w = self.max_size - new_size[1]
        if pad_h or pad_w:
            img = F.pad(
                img,
                padding=[
                    pad_w // 2,
                    pad_h // 2,
                    pad_w - pad_w // 2,
                    pad_h - pad_h // 2,
                ],
                fill=self.fill,
            )
        return img
def freeze_batch_norm_2d(module, module_match=None, name=""):
    """
    Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
    itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
    returned. Otherwise, the module is walked recursively and submodules are converted in place.

    Args:
        module (torch.nn.Module): Any PyTorch module.
        module_match (collection of str, optional): Full module names to freeze (all if empty or None)
        name (str): Full module name (prefix)

    Returns:
        torch.nn.Module: Resulting module

    Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
    """
    res = module
    # An empty/None filter means "freeze every batch-norm layer found".
    is_match = True
    if module_match:
        is_match = name in module_match
    if is_match and isinstance(
        module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)
    ):
        res = FrozenBatchNorm2d(module.num_features)
        res.num_features = module.num_features
        res.affine = module.affine
        if module.affine:
            res.weight.data = module.weight.data.clone().detach()
            res.bias.data = module.bias.data.clone().detach()
        res.running_mean.data = module.running_mean.data
        res.running_var.data = module.running_var.data
        res.eps = module.eps
    else:
        for child_name, child in module.named_children():
            full_child_name = ".".join([name, child_name]) if name else child_name
            new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
            # Re-register only the children that were actually replaced.
            if new_child is not child:
                res.add_module(child_name, new_child)
    return res
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and hands back the caller's dtype (fp16-safe)."""

    def forward(self, x: torch.Tensor):
        # Remember the incoming dtype, normalise in float32, then cast back.
        input_dtype = x.dtype
        normalized = super().forward(x.to(torch.float32))
        return normalized.to(input_dtype)
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention then an MLP, each with a residual add."""

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, use_grad_checkpointing=False):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

        if use_grad_checkpointing:
            self.attn = checkpoint_wrapper(self.attn)
            self.mlp = checkpoint_wrapper(self.mlp)

    def attention(self, x: torch.Tensor):
        """Run self-attention on ``x`` and return only the attended values."""
        # The stored mask is moved to the input's dtype/device on every call.
        if self.attn_mask is not None:
            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    """A sequential stack of ``layers`` residual attention blocks.

    Args:
        width: Model (embedding) dimension of every block.
        layers: Number of blocks.
        heads: Number of attention heads per block.
        attn_mask: Optional attention mask shared by all blocks.
        use_grad_checkpointing: Enable gradient checkpointing.
        checkpoint_start_layer: First block index (0-based) that is
            checkpointed when ``use_grad_checkpointing`` is set. The default
            of 13 reproduces the previously hard-coded ``i > 12`` rule.
    """

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None,
                 use_grad_checkpointing=False, checkpoint_start_layer: int = 13):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[
            ResidualAttentionBlock(
                width, heads, attn_mask,
                use_grad_checkpointing and i >= checkpoint_start_layer,
            )
            for i in range(layers)
        ])

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class VisionTransformer(nn.Module):
    """ViT image encoder that returns the token sequence of its last block.

    No final LayerNorm is applied: the ``ln_final`` layer is disabled in this
    variant, so ``forward`` returns the raw transformer output.
    """

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int,
                 use_grad_checkpointing: bool):
        super().__init__()
        self.input_resolution = input_resolution
        self.num_features = width
        self.num_heads = heads
        self.num_patches = (input_resolution // patch_size) ** 2
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size,
                               stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(self.num_patches + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads, use_grad_checkpointing=use_grad_checkpointing)

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        # Adding zeros broadcasts the class embedding to one row per batch item.
        class_tokens = self.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
        )
        x = torch.cat([class_tokens, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        return x

    def get_num_layer(self, var_name=""):
        """Map a parameter name to a layer index.

        Returns 0 for the embeddings, ``conv1`` and ``ln_pre``; block index + 1
        for ``transformer.resblocks.<i>...``; the number of blocks otherwise.
        """
        if var_name in ("class_embedding", "positional_embedding", "conv1", "ln_pre"):
            return 0
        elif var_name.startswith("transformer.resblocks"):
            layer_id = int(var_name.split('.')[2])
            return layer_id + 1
        else:
            return len(self.transformer.resblocks)


# From PyTorch internals
def _ntuple(n):
    """Return a parser that turns a scalar into an ``n``-tuple.

    Non-string iterables are converted to a tuple as they are; anything else
    (including strings) is repeated ``n`` times.
    """
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return tuple(x)
        return tuple(repeat(x, n))
    return parse


to_2tuple = _ntuple(2)


def interpolate_pos_embed(model, state_dict, interpolation: str = 'bicubic', seq_dim=1):
    """Resize ``state_dict['positional_embedding']`` in place to fit ``model``.

    Does nothing when the checkpoint has no positional embedding or when its
    length already matches the model. Otherwise the leading token row is kept
    and the remaining rows are treated as a square grid and interpolated to
    the model's grid size.

    Args:
        model: Object exposing ``positional_embedding`` with one leading extra
            token followed by a square grid of positions.
        state_dict: Checkpoint mapping; updated in place.
        interpolation: Mode passed to ``F.interpolate``.
        seq_dim: Unused; kept for interface compatibility.
    """
    old_pos_embed = state_dict.get('positional_embedding', None)
    if old_pos_embed is None:
        return

    grid_size = to_2tuple(round((model.positional_embedding.shape[0] - 1) ** 0.5))
    extra_tokens = 1  # FIXME detect different token configs (ie no class token, or more)
    new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
    if new_seq_len == old_pos_embed.shape[0]:
        return

    if extra_tokens:
        pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
    else:
        pos_emb_tok, pos_emb_img = None, old_pos_embed

    old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))

    # Format explicitly: print() does not interpolate logging-style arguments.
    print('Resizing position embedding grid-size from %s to %s' % (old_grid_size, grid_size))
    pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
    pos_emb_img = F.interpolate(
        pos_emb_img,
        size=grid_size,
        mode=interpolation,
        align_corners=True,
    )
    pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
    if pos_emb_tok is not None:
        new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
    else:
        new_pos_embed = pos_emb_img
    state_dict['positional_embedding'] = new_pos_embed


def create_clip_vit_L(img_size=224, use_checkpoint=False, precision="fp16"):
    """Build the 23-layer, width-1024 ViT and load its downloaded weights.

    Args:
        img_size: Input resolution; position embeddings are resized to match.
        use_checkpoint: Enable gradient checkpointing in the transformer.
        precision: When ``"fp16"``, weights are converted to half precision.

    Returns:
        The initialised ``VisionTransformer``.
    """
    model = VisionTransformer(
        input_resolution=img_size,
        patch_size=14,
        width=1024,
        layers=23,
        heads=16,
        use_grad_checkpointing=use_checkpoint,
    )
    url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/clip_vit_L.pth"
    cached_file = download_cached_file(
        url, check_hash=False, progress=True
    )
    state_dict = torch.load(cached_file, map_location="cpu")
    interpolate_pos_embed(model, state_dict)

    # strict=False: missing or unexpected checkpoint keys are tolerated.
    model.load_state_dict(state_dict, strict=False)

    if precision == "fp16":
        convert_weights_to_fp16(model)
    return model
+import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ +from timm.models.registry import register_model + +from lavis.common.dist_utils import download_cached_file + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + **kwargs + } + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., window_size=None, attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = 
nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, 
requires_grad=False), self.v_bias)) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, + window_size=None, attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if init_values is not None and init_values > 0: + self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) + self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +class 
class VisionTransformer(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    This variant has no classification head and no final norm: ``forward``
    returns the full token sequence produced by the last block.
    ``num_classes``, ``use_mean_pooling`` and ``init_scale`` are accepted for
    interface compatibility; only ``num_classes`` is stored.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
                 use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
                 use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
        super().__init__()
        self.image_size = img_size
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None
        self.use_checkpoint = use_checkpoint

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
            for i in range(depth)])

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()

    def fix_init_weight(self):
        """Divide each block's attn.proj and mlp.fc2 weights by sqrt(2 * k), k being the 1-based block index."""
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        """Truncated-normal Linear weights with zero biases; unit-weight, zero-bias LayerNorm."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_classifier(self):
        """Return the classification head, or ``None`` if none has been attached.

        The constructor does not create a head; one exists only after
        ``reset_classifier`` has been called.
        """
        return getattr(self, "head", None)

    def reset_classifier(self, num_classes, global_pool=''):
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def _embed_tokens(self, x):
        """Patch-embed ``x``, prepend the cls token, add position embedding and dropout."""
        x = self.patch_embed(x)
        batch_size = x.size(0)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        return self.pos_drop(x)

    def forward_features(self, x):
        """Return the token sequence after the last block (cls token first)."""
        x = self._embed_tokens(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, rel_pos_bias)
            else:
                x = blk(x, rel_pos_bias)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        return x

    def get_intermediate_layers(self, x):
        """Return a list with the output of every block, in order."""
        x = self._embed_tokens(x)

        features = []
        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            x = blk(x, rel_pos_bias)
            features.append(x)

        return features

    def get_num_layer(self, var_name=""):
        """Map a parameter name to a layer index (0 for embeddings, block index + 1 for blocks)."""
        if var_name in ("cls_token", "mask_token", "pos_embed"):
            return 0
        elif var_name.startswith("patch_embed"):
            return 0
        elif var_name.startswith("rel_pos_bias"):
            return len(self.blocks) - 1
        elif var_name.startswith("blocks"):
            layer_id = int(var_name.split('.')[1])
            return layer_id + 1
        else:
            return len(self.blocks)


def interpolate_pos_embed(model, checkpoint_model):
    """Resize ``checkpoint_model['pos_embed']`` in place to match ``model``.

    Nothing happens when the checkpoint has no ``pos_embed``, when the model
    has no absolute position embedding (``model.pos_embed`` is ``None``), or
    when the grid sizes already agree. Otherwise the extra (non-patch) tokens
    are kept and the patch positions are bicubically interpolated as a square
    grid.

    Args:
        model: Object exposing ``pos_embed`` and ``patch_embed.num_patches``.
        checkpoint_model: Checkpoint state dict; updated in place.
    """
    if 'pos_embed' not in checkpoint_model:
        return
    if getattr(model, 'pos_embed', None) is None:
        # Model built without absolute position embeddings: nothing to resize.
        return

    pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = model.patch_embed.num_patches
    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches ** 0.5)
    if orig_size == new_size:
        return

    print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
    # class_token and dist_token are kept unchanged
    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
    # only the position tokens are interpolated
    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
    pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
    pos_tokens = torch.nn.functional.interpolate(
        pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
    new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
    checkpoint_model['pos_embed'] = new_pos_embed


def convert_weights_to_fp16(model: nn.Module):
    """Convert applicable model parameters to fp16

    Only the weights and biases of Conv1d, Conv2d and Linear layers are
    converted; all other parameters keep their dtype.
    """

    def _convert_weights_to_fp16(module):
        if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            module.weight.data = module.weight.data.half()
            if module.bias is not None:
                module.bias.data = module.bias.data.half()

    model.apply(_convert_weights_to_fp16)


def create_eva_vit_g(img_size=224, drop_path_rate=0.4, use_checkpoint=False, precision="fp16"):
    """Build the 39-block, width-1408 ViT and load its downloaded weights.

    Args:
        img_size: Input resolution; position embeddings are resized to match.
        drop_path_rate: Maximum stochastic-depth rate across blocks.
        use_checkpoint: Enable gradient checkpointing of the blocks.
        precision: When ``"fp16"``, Conv/Linear weights are converted to half.

    Returns:
        The initialised ``VisionTransformer``.
    """
    model = VisionTransformer(
        img_size=img_size,
        patch_size=14,
        use_mean_pooling=False,
        embed_dim=1408,
        depth=39,
        num_heads=1408//88,
        mlp_ratio=4.3637,
        qkv_bias=True,
        drop_path_rate=drop_path_rate,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        use_checkpoint=use_checkpoint,
    )
    url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
    cached_file = download_cached_file(
        url, check_hash=False, progress=True
    )
    state_dict = torch.load(cached_file, map_location="cpu")
    interpolate_pos_embed(model, state_dict)

    # strict=False: missing or unexpected checkpoint keys are tolerated.
    model.load_state_dict(state_dict, strict=False)

    if precision == "fp16":
        convert_weights_to_fp16(model)
    return model
@registry.register_model("gpt_dialogue")
class GPTDialogue(BaseModel, GPT2LMHeadModel):
    """GPT-2 language model extended with linear layers to read and predict video features.

    Video features are projected into the token embedding space and prepended
    to the text embeddings; a second projection maps hidden states back to the
    video feature space for a regression loss.
    """

    PRETRAINED_MODEL_CONFIG_DICT = {"base": "configs/models/gpt_dialogue_base.yaml"}

    def __init__(self, config, len_video_ft=4224):

        super().__init__(config)

        self.video_ff = nn.Linear(len_video_ft, config.n_embd)
        self.video_ff_out = nn.Linear(config.n_embd, len_video_ft)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        samples,
        past_key_values=None,
        position_ids=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the model and compute the language-model and video losses.

        Args:
            samples (dict): Must contain ``input_ids``, ``attn_mask`` and
                ``token_type_ids``. ``video_fts`` and ``labels`` are optional;
                a missing key is treated like ``None``.
            past_key_values: Accepted for interface compatibility.
                NOTE(review): it is not forwarded to the transformer — confirm
                whether cached decoding is meant to be supported.
            return_dict: Accepted for interface compatibility; the result is
                always a ``CausalLMOutputWithCrossAttentions``.

        Returns:
            CausalLMOutputWithCrossAttentions whose ``loss`` is the sum of the
            cross-entropy loss (when labels are given) and the MSE video loss
            (when video features are given), or ``None`` if neither applies.
        """
        labels = samples.get("labels")
        video_fts = samples.get("video_fts")

        input_embs = self.transformer.wte(samples["input_ids"])
        if video_fts is not None:
            # Video tokens go in front of the text tokens.
            video_embs = self.video_ff(video_fts)
            input_embs = torch.cat([video_embs, input_embs], dim=1)

        transformer_outputs = self.transformer(
            attention_mask=samples["attn_mask"],
            token_type_ids=samples["token_type_ids"],
            inputs_embeds=input_embs,
            position_ids=position_ids,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # Always request a ModelOutput: the fields are read by attribute
            # below, which would fail on the tuple returned otherwise.
            return_dict=True,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )

        if video_fts is not None:
            len_video_fts = video_fts.shape[1]
            video_logits = self.video_ff_out(hidden_states[:, :len_video_fts, :])
            # Shift so that tokens < n predict n
            shift_logits = video_logits[..., :-1, :].contiguous()
            shift_labels = video_fts[..., 1:, :].contiguous()
            # Flatten the tokens
            loss_fct = MSELoss(reduction="mean")
            video_loss = loss_fct(shift_logits, shift_labels)

            if loss is not None:
                loss = loss + video_loss
            else:
                loss = video_loss

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )

    @classmethod
    def from_config(cls, cfg):
        """Load pretrained "gpt2" weights and resize the token embeddings to ``cfg["len_tokenizer"]``."""
        # NOTE(review): from_pretrained is called on the second base class
        # (cls.__bases__[1]), not on cls itself — confirm that the returned
        # object is expected to be of that base type.
        model = cls.__bases__[1].from_pretrained("gpt2")
        model.resize_token_embeddings(cfg["len_tokenizer"])
        return model
+ Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import torch + + + diff --git a/lavis/models/img2prompt_models/__pycache__/__init__.cpython-310.pyc b/lavis/models/img2prompt_models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c266fd5ac55baa13f855db8b0be1c45113b86a9c Binary files /dev/null and b/lavis/models/img2prompt_models/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/img2prompt_models/__pycache__/img2prompt_vqa.cpython-310.pyc b/lavis/models/img2prompt_models/__pycache__/img2prompt_vqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5e360e713d2392cc768abd89ff6633214fd60e0 Binary files /dev/null and b/lavis/models/img2prompt_models/__pycache__/img2prompt_vqa.cpython-310.pyc differ diff --git a/lavis/models/img2prompt_models/img2prompt_vqa.py b/lavis/models/img2prompt_models/img2prompt_vqa.py new file mode 100644 index 0000000000000000000000000000000000000000..0df1807eb3cae7e62d5fe85d5058eb3e9657c4b1 --- /dev/null +++ b/lavis/models/img2prompt_models/img2prompt_vqa.py @@ -0,0 +1,587 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + (CVPR 23') From Images to Textual Prompts: Zero-shot VQA with Frozen Large Language Models, + by Jiaxian Guo, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Boyang Li, Dacheng Tao, Steven CH Hoi + + Initially referred as Img2prompt_vqa, later Img2LLM_vqa. 
@registry.register_model("img2prompt_vqa")
class Img2PromptVQA(BaseModel):
    """
    Img2Prompt_VQA model consists of three submodels for zero-shot VQA:
        1. Image-questioning matching model
        2. Image captioning model
        3. Large Language model

    Supported model types:
        - base: BLIPITM, BLIPCaption, PNPUnifiedQAv2FiD (t5-base)
        - large: BLIPITM, BLIPCaption, PNPUnifiedQAv2FiD (t5-large)
        - 3b: BLIPITM, BLIPCaption, PNPUnifiedQAv2FiD (t5-3b)

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("img2prompt_vqa", "base", is_eval=True)
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "base": "configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml",
    }

    def __init__(
        self,
        image_question_matching_model,
        image_captioning_model,
        question_generation_model,
        question_generation_tokenizer,
        offload_model=False,
    ):
        """Store the submodels and load the spaCy ``en_core_web_sm`` pipeline.

        Args:
            image_question_matching_model: Model used for GradCAM computation
                and caption filtering (``itm_rank``).
            image_captioning_model: Model used to encode images and generate captions.
            question_generation_model: Stored as-is; not used in the methods shown here.
            question_generation_tokenizer: Stored as-is; not used in the methods shown here.
            offload_model (bool): Stored as-is; not used in the methods shown here.
        """
        super().__init__()

        self.image_question_matching_model = image_question_matching_model
        self.image_captioning_model = image_captioning_model
        self.question_generation_model = question_generation_model
        self.question_generation_tokenizer = question_generation_tokenizer
        self.offload_model = offload_model
        self.nlp = spacy.load("en_core_web_sm")

    def forward_itm(self, samples, block_num=7):
        """
        Compute GradCAM maps of the questions over the images and store them in ``samples``.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
            block_num (int): The index of cross-attention block for gradcam computation.

        Returns:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
        """
        image = samples["image"]
        # Leading and trailing question marks are removed before tokenisation.
        question = [text.strip("?") for text in samples["text_input"]]
        tokenized_text = self.image_question_matching_model.tokenizer(
            question, padding="longest", truncation=True, return_tensors="pt"
        ).to(self.image_question_matching_model.device)
        # Gradients are force-enabled here regardless of the surrounding grad mode.
        with torch.set_grad_enabled(True):
            gradcams, _ = compute_gradcam(
                model=self.image_question_matching_model,
                visual_input=image,
                text_input=question,
                tokenized_text=tokenized_text,
                block_num=block_num,
            )

        # Only element 1 of each per-sample GradCAM result is kept.
        gradcams = [gradcam_[1] for gradcam_ in gradcams]
        samples["gradcams"] = torch.stack(gradcams).reshape(
            samples["image"].size(0), -1
        )

        return samples

    def itm_rank(self, image_embeds, image_atts, encoder_input_ids, match_head="itm"):
        """Score text against image embeddings with the "itm" or "itc" head.

        NOTE(review): this method reads ``self.prompt_length``,
        ``self.tokenizer``, ``self.text_encoder``, ``self.itm_head``,
        ``self.vision_proj`` and ``self.text_proj``, none of which are assigned
        in the ``__init__`` above — confirm where they are expected to come
        from. ``forward_cap`` calls ``itm_rank`` on
        ``self.image_question_matching_model``, not this method.

        Args:
            image_embeds: Image features; position 0 along dim 1 is used by the "itc" head.
            image_atts: Attention mask for ``image_embeds``.
            encoder_input_ids: Token ids; cloned, then the first
                ``prompt_length - 1`` columns are dropped.
            match_head (str): "itm" returns the ITM head output for the first
                token; "itc" returns the image-text similarity matrix.
                Any other value returns ``None``.
        """
        encoder_input_ids = encoder_input_ids.clone()
        encoder_input_ids = encoder_input_ids[:, self.prompt_length - 1 :]
        text_attention_mask = (encoder_input_ids != self.tokenizer.pad_token_id).long()

        if match_head == "itm":
            # Replace the first token with the encoder token before cross-attending to the image.
            encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
            output = self.text_encoder(
                encoder_input_ids,
                attention_mask=text_attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            itm_output = self.itm_head(output.last_hidden_state[:, 0, :])
            return itm_output

        elif match_head == "itc":
            encoder_input_ids[:, 0] = self.tokenizer.cls_token_id
            text_output = self.text_encoder(
                encoder_input_ids,
                attention_mask=text_attention_mask,
                return_dict=True,
                mode="text",
            )
            image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
            text_feat = F.normalize(
                self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1
            )

            sim = image_feat @ text_feat.t()
            return sim

    def forward_cap(
        self,
        samples,
        cap_max_length=20,
        cap_min_length=0,
        top_p=1,
        top_k=50,
        repetition_penalty=1.0,
        num_captions=100,
        num_patches=20,
    ):
        """
        Generate captions from image patches sampled according to the GradCAM maps.

        Sampling rounds repeat until every image has ``num_captions`` accepted
        captions. A caption is accepted when it is not a substring of an
        already accepted caption for that image and its ITM score is >= 0.5.
        NOTE(review): the loop has no iteration limit — it does not terminate
        if captions keep being rejected.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
            cap_max_length (int): The maximum length of the caption to be generated.
            cap_min_length (int): The minimum length of the caption to be generated.
            top_p (float): The cumulative probability for nucleus sampling.
            top_k (int): The number of the highest probability tokens for top-k sampling.
            repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
            num_captions (int): Number of captions generated for each image.
            num_patches (int): Number of patches sampled for each image.

        Returns:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
                - captions (nested list): A nested list of strings of total length batch_size * num_captions
        """
        encoder_out = self.image_captioning_model.forward_encoder(samples)
        captions = [[] for _ in range(encoder_out.size(0))]

        min_num_captions = 0

        while min_num_captions < num_captions:
            encoder_out_samples = []
            for i in range(num_captions):
                # Sample patch indices in proportion to the GradCAM weights.
                # The +1 offset presumably skips a leading non-patch (CLS)
                # position in encoder_out — TODO confirm.
                patch_id = (
                    torch.multinomial(
                        samples["gradcams"].to(self.image_captioning_model.device),
                        num_patches,
                    ).reshape(encoder_out.size(0), -1)
                    + 1
                )
                # Sort the indices and expand them over the feature dimension for gather.
                patch_id = (
                    patch_id.sort(dim=1)
                    .values.unsqueeze(-1)
                    .expand(-1, -1, encoder_out.size(2))
                )
                encoder_out_sample = torch.gather(encoder_out, 1, patch_id)
                encoder_out_samples.append(encoder_out_sample)

            stacked = torch.stack(encoder_out_samples, dim=1)
            image_embeds = torch.flatten(
                stacked, start_dim=0, end_dim=1
            )  # (bsz*num_seq, num_patch, dim)

            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
                self.image_captioning_model.device
            )
            model_kwargs = {
                "encoder_hidden_states": image_embeds,
                "encoder_attention_mask": image_atts,
            }

            # One copy of the captioning prompt per sampled patch set; the first
            # token is replaced with BOS and the last token is dropped.
            prompt = [self.image_captioning_model.prompt] * image_embeds.size(0)
            prompt = self.image_captioning_model.tokenizer(
                prompt, return_tensors="pt"
            ).to(self.image_captioning_model.device)
            prompt.input_ids[:, 0] = self.image_captioning_model.tokenizer.bos_token_id
            prompt.input_ids = prompt.input_ids[:, :-1]

            decoder_out = self.image_captioning_model.text_decoder.generate(
                input_ids=prompt.input_ids,
                max_length=cap_max_length,
                min_length=cap_min_length,
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                num_return_sequences=1,
                eos_token_id=self.image_captioning_model.tokenizer.sep_token_id,
                pad_token_id=self.image_captioning_model.tokenizer.pad_token_id,
                repetition_penalty=repetition_penalty,
                **model_kwargs
            )

            itm_outputs = self.image_question_matching_model.itm_rank(
                image_embeds, image_atts, encoder_input_ids=decoder_out
            )  # caption filter

            outputs = self.image_captioning_model.tokenizer.batch_decode(
                decoder_out, skip_special_tokens=True
            )

            for counter, output in enumerate(outputs):
                # Outputs are laid out image-major: num_captions consecutive entries per image.
                ind = counter // num_captions
                if len(captions[ind]) < num_captions:
                    # Strip the prompt text from the front of the decoded caption.
                    caption = output[len(self.image_captioning_model.prompt) :]
                    overlap_caption = [1 for caps in captions[ind] if caption in caps]
                    if (
                        len(overlap_caption) == 0 and itm_outputs[counter] >= 0.5
                    ):  # image filter
                        captions[ind].append(caption)

            min_num_captions = min([len(i) for i in captions])

        samples["captions"] = captions

        return samples
def create_context_prompt(self, samples, num_caps_per_img=30):
    """Build the caption context string for the LLM prompt.

    Walks the answers from last to first (wrapping around) for
    ``num_caps_per_img`` steps. For each answer, the trailing character is
    dropped and the rest lower-cased to look up caption indices in
    ``samples["ans_to_cap_dict"]`` (falling back to ``[0]``). The first
    caption index not used so far is appended to the context.

    Args:
        samples (dict): Must contain ``"ans_to_cap_dict"``, ``"captions"``
            (only ``captions[0]`` is read) and ``"answers"``.
        num_caps_per_img (int): Number of answer slots to visit.

    Returns:
        str: The concatenated captions. The same string is also stored in
        ``samples["Context_Prompt"]``.
    """
    ans_to_caps = samples["ans_to_cap_dict"]
    captions = samples["captions"][0]
    answers = samples["answers"]

    chosen = []
    used_ids = set()  # O(1) membership instead of scanning a list
    num_answers = len(answers)
    # Guard: with no answers the modulo below would divide by zero.
    if num_answers:
        for idx in range(num_caps_per_img):
            answer = answers[(num_answers - 1 - idx) % num_answers]
            # Answers carry one trailing character that is stripped here
            # before the dictionary lookup.
            key = answer[:-1].lower()
            for cap_id in ans_to_caps.get(key, [0]):
                if cap_id not in used_ids:
                    chosen.append(captions[cap_id])
                    used_ids.add(cap_id)
                    break  # We just take one cap for each answer

    context_prompt = "".join(chosen)
    samples["Context_Prompt"] = context_prompt
    return context_prompt
def prompts_construction(
    self,
    samples,
    question_type="neural",
    num_caps_per_img=30,
    num_question_per_img=30,
):
    """Assemble the full LLM prompt from context, task examples and question.

    Calls ``self.create_context_prompt`` and then ``self.create_task_prompt``
    (in that order) and joins their results with a fixed instruction header
    and the first entry of ``samples["text_input"]``.

    Args:
        samples (dict): Forwarded to both helper methods; ``"text_input"``
            must be indexable and its first element a string.
        question_type (str): Forwarded to ``create_task_prompt``.
        num_caps_per_img (int): Forwarded to ``create_context_prompt``.
        num_question_per_img (int): Forwarded to ``create_task_prompt``.

    Returns:
        str: The complete prompt, ending in ``"\\nAnswer:"``.
    """
    context_part = self.create_context_prompt(samples, num_caps_per_img)
    task_part = self.create_task_prompt(
        samples, question_type, num_question_per_img
    )

    segments = (
        "Please reason the answer of the questions according to the given contexts.\n",
        "Contexts:",
        context_part,
        "\n",
        task_part,
        "Question:",
        samples["text_input"][0],
        "\nAnswer:",
    )
    return "".join(segments)
1 means no beam search. + inference_method (str): Inference method. Must be "generate". The model will generate answers. + max_len (int): Maximum length of generated answers. + min_len (int): Minimum length of generated answers. + internal_bsz_fid (int): Internal batch size when using FiD decoding. + num_captions (int): Number of captions generated for each image. + num_captions_fid (int): Number of captions concatenated with a question during FiD decoding. + cap_max_length (int): The maximum length of the caption to be generated. + cap_min_length (int): The minimum length of the caption to be generated. + top_k (float): The number of the highest probability tokens for top-k sampling. + top_p (float): The cumulative probability for nucleus sampling. + repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. + num_patches (int): Number of patches sampled for each image. + block_num (int): The index of cross-attention block for gradcam computation. + + Returns: + List: A list of strings, each string is an answer. + gradcams (torch.Tensor): A tensor of shape (batch_size, H*W) + captions (nested list): A nested list of strings of total length batch_size * num_captions + """ + assert inference_method in [ + "generate", + ], "Inference method must be 'generate', got {}.".format(inference_method) + + if isinstance(samples["text_input"], str): + samples["text_input"] = [samples["text_input"]] + + assert len(samples["text_input"]) == samples["image"].size( + 0 + ), "The number of questions must be equal to the batch size." 
+ + samples = self.forward_itm(samples, block_num=block_num) + + samples = self.forward_cap( + samples, + cap_max_length=cap_max_length, + cap_min_length=cap_min_length, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + num_captions=num_captions, + num_patches=num_patches, + ) + + if self.offload_model: + samples["image"] = samples["image"].to("cpu") + self.image_question_matching_model.to("cpu") + self.image_captioning_model.to("cpu") + torch.cuda.empty_cache() + + pred_answers = self.forward_qa( + samples, + num_beams=num_beams, + max_len=max_len, + min_len=min_len, + internal_bsz_fid=internal_bsz_fid, + num_captions=num_captions, + num_captions_fid=num_captions_fid, + ) + + if self.offload_model: + self.image_question_matching_model.to(self.question_answering_model.device) + self.image_captioning_model.to(self.question_answering_model.device) + + return pred_answers, samples["captions"], samples["gradcams"] + + @classmethod + def from_config(cls, model_config): + itm_config = model_config.image_question_matching_model + cap_config = model_config.image_captioning_model + + itm_cls = registry.get_model_class(itm_config.arch) + cap_cls = registry.get_model_class(cap_config.arch) + + image_question_matching_model = itm_cls.from_config(itm_config) + image_captioning_model = cap_cls.from_config(cap_config) + + question_generation_tokenizer = T5Tokenizer.from_pretrained( + "google/t5-large-lm-adapt" + ) + question_generation_model = T5ForConditionalGeneration.from_pretrained( + "google/t5-large-lm-adapt" + ) + cached_file = download_cached_file( + "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth", + check_hash=False, + progress=True, + ) + checkpoint = torch.load(cached_file, map_location="cpu") + state_dict = checkpoint["model"] + question_generation_model.load_state_dict(state_dict) + model = cls( + image_question_matching_model=image_question_matching_model, + 
image_captioning_model=image_captioning_model, + question_generation_model=question_generation_model, + question_generation_tokenizer=question_generation_tokenizer, + offload_model=False, + ) + + return model diff --git a/lavis/models/med.py b/lavis/models/med.py new file mode 100644 index 0000000000000000000000000000000000000000..e963ffb3b3d3a1389e0da1ba0f9c9ac9fb6e84b2 --- /dev/null +++ b/lavis/models/med.py @@ -0,0 +1,1416 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on huggingface code base + https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import Tensor, device +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F +from transformers import BatchEncoding, PreTrainedTokenizer + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig +from lavis.common.utils import get_abs_path + +from lavis.models.base_model import BaseEncoder + +logging.set_verbosity_error() +logger = 
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings.

    Token-type embeddings are only created when
    ``config.add_type_embeddings`` is truthy; passing ``token_type_ids`` to
    ``forward`` on a module built without them raises ``AttributeError``.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )

        if config.add_type_embeddings:
            self.token_type_embeddings = nn.Embedding(
                config.type_vocab_size, config.hidden_size
            )

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
        )
        self.position_embedding_type = getattr(
            config, "position_embedding_type", "absolute"
        )

        self.config = config

    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        past_key_values_length=0,
    ):
        """Embed tokens, add optional type/position embeddings, normalise.

        Args:
            input_ids: Token ids; used for the sequence length and, when
                ``inputs_embeds`` is None, looked up in ``word_embeddings``.
            token_type_ids: Optional ids for ``token_type_embeddings``.
            position_ids: Optional explicit positions. Defaults to a slice of
                the registered ``position_ids`` buffer starting at
                ``past_key_values_length``.
            inputs_embeds: Optional precomputed embeddings. Never modified.
            past_key_values_length: Offset into the position buffer.

        Returns:
            The embeddings after LayerNorm and dropout.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[
                :, past_key_values_length : seq_length + past_key_values_length
            ]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds
        if token_type_ids is not None:
            embeddings = embeddings + self.token_type_embeddings(token_type_ids)

        if self.position_embedding_type == "absolute":
            # Out-of-place on purpose: without token types `embeddings` is the
            # very tensor the caller passed as `inputs_embeds`, and an in-place
            # `+=` would overwrite it.
            embeddings = embeddings + self.position_embeddings(position_ids)

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
class BertSelfOutput(nn.Module):
    """Dense projection, dropout and residual LayerNorm applied after attention."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertIntermediate(nn.Module):
    """Feed-forward expansion: linear layer followed by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string names an entry in ACT2FN; anything else is used as the
        # activation callable directly.
        self.intermediate_act_fn = (
            ACT2FN[activation] if isinstance(activation, str) else activation
        )

    def forward(self, hidden_states):
        """Return ``intermediate_act_fn(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))
ALBEF and BLIP + try: + # ALBEF & ALPRO + fusion_layer = self.config.fusion_layer + add_cross_attention = ( + fusion_layer <= layer_num and self.config.add_cross_attention + ) + + self.fusion_layer = fusion_layer + except AttributeError: + # BLIP + self.fusion_layer = self.config.num_hidden_layers + add_cross_attention = self.config.add_cross_attention + + # if self.config.add_cross_attention: + if add_cross_attention: + self.crossattention = BertAttention( + config, is_cross_attention=self.config.add_cross_attention + ) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + mode=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + # TODO line 482 in albef/models/xbert.py + # compatibility for ALBEF and BLIP + if mode in ["multimodal", "fusion"] and hasattr(self, "crossattention"): + assert ( + encoder_hidden_states is not None + ), "encoder_hidden_states must be given for cross-attention layers" + + if isinstance(encoder_hidden_states, list): + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states[ + (self.layer_num - self.fusion_layer) + % len(encoder_hidden_states) + ], + encoder_attention_mask[ + (self.layer_num - self.fusion_layer) + % len(encoder_hidden_states) + ], + output_attentions=output_attentions, + ) + 
class BertEncoder(nn.Module):
    """Stack of ``BertLayer`` modules that can run a sub-range of layers.

    ``mode`` selects which layers are executed:

    * ``"text"``: layers ``[0, fusion_layer)``
    * ``"fusion"``: layers ``[fusion_layer, num_hidden_layers)``
    * ``"multimodal"``: all layers

    ``fusion_layer`` is ``config.fusion_layer`` when the config defines it
    (ALBEF) and ``config.num_hidden_layers`` otherwise (BLIP).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [BertLayer(config, i) for i in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        mode="multimodal",
    ):
        """Run the selected layers over ``hidden_states``.

        Args:
            hidden_states: Input to the first executed layer.
            attention_mask, encoder_hidden_states, encoder_attention_mask:
                Forwarded unchanged to every executed layer.
            head_mask: Optional per-layer sequence; entry ``i`` goes to layer ``i``.
            past_key_values: Optional per-layer sequence; entry ``i`` goes to
                layer ``i``.
            use_cache: When truthy, the last element of each layer's output is
                collected and returned.
            output_attentions: When truthy, element 1 of each layer's output
                is collected as self-attention.
            output_hidden_states: When truthy, the input of every executed
                layer plus the final output are collected.
            return_dict: Return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a tuple of the non-``None`` results.
            mode: ``"text"``, ``"fusion"`` or ``"multimodal"``.

        Raises:
            ValueError: If ``mode`` is not one of the three supported values.

        Note:
            ``cross_attentions`` is initialised but never appended to here, so
            it is either ``None`` or an empty tuple.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = (
            () if output_attentions and self.config.add_cross_attention else None
        )

        next_decoder_cache = () if use_cache else None

        # compatibility for ALBEF and BLIP: only ALBEF configs carry `fusion_layer`.
        num_layers = self.config.num_hidden_layers
        fusion_layer = getattr(self.config, "fusion_layer", num_layers)

        if mode == "text":
            start_layer, output_layer = 0, fusion_layer
        elif mode == "fusion":
            start_layer, output_layer = fusion_layer, num_layers
        elif mode == "multimodal":
            start_layer, output_layer = 0, num_layers
        else:
            raise ValueError(
                "mode must be 'text', 'fusion' or 'multimodal', got {!r}.".format(mode)
            )

        for i in range(start_layer, output_layer):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module, layer_past, want_attentions, run_mode):
                    # Everything that is not a checkpointed tensor input is
                    # frozen here as an argument, so a recomputation during
                    # backward sees this layer's values (not the last
                    # iteration's) and `mode` actually reaches the layer.
                    def custom_forward(*inputs):
                        return module(
                            *inputs, layer_past, want_attentions, mode=run_mode
                        )

                    return custom_forward

                # Only positional inputs are handed to checkpoint(); `mode`
                # travels through the closure above.
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(
                        layer_module, past_key_value, output_attentions, mode
                    ),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    mode=mode,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertOnlyMLMHead(nn.Module):
    """Language-modeling head that delegates to a ``BertLMPredictionHead``."""

    def __init__(self, config):
        super().__init__()
        # The attribute name is part of the parameter path of this module.
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return whatever the prediction head produces for ``sequence_output``."""
        return self.predictions(sequence_output)
def get_extended_attention_mask(
    self,
    attention_mask: Tensor,
    input_shape: Tuple[int],
    device: device,
    is_decoder: bool,
) -> Tensor:
    """
    Turn a 0/1 attention mask into an additive mask broadcastable over heads.

    Arguments:
        attention_mask (:obj:`torch.Tensor`):
            Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            Either 2-D ``[batch_size, seq_length]`` or 3-D
            ``[batch_size, from_seq_length, to_seq_length]``.
        input_shape (:obj:`Tuple[int]`):
            The shape of the input to the model; unpacked as ``(batch_size, seq_length)``
            when a causal mask is built.
        device: (:obj:`torch.device`):
            The device on which the causal mask is created.
        is_decoder (:obj:`bool`):
            When true and the mask is 2-D, a causal (lower-triangular) mask is
            combined with the padding mask.

    Returns:
        :obj:`torch.Tensor` of dtype ``self.dtype`` holding ``0.0`` where attention
        is allowed and ``-10000.0`` where it is masked.

    Raises:
        ValueError: If ``attention_mask`` is neither 2-D nor 3-D.
    """
    rank = attention_mask.dim()
    if rank not in (2, 3):
        raise ValueError(
            "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                input_shape, attention_mask.shape
            )
        )

    if rank == 3:
        # A full [batch, from, to] mask was supplied: only add the head axis.
        keep = attention_mask[:, None, :, :]
    elif not is_decoder:
        # Padding mask only: add head and query axes.
        keep = attention_mask[:, None, None, :]
    else:
        batch_size, seq_length = input_shape

        positions = torch.arange(seq_length, device=device)
        # causal[b, i, j] is true when key position j is not after query i.
        causal = (
            positions[None, None, :].repeat(batch_size, seq_length, 1)
            <= positions[None, :, None]
        )
        # causal and attention masks must have same type with pytorch version < 1.3
        causal = causal.to(attention_mask.dtype)

        # When the padding mask is longer than the current input (cached
        # keys), every query may attend to all of the extra leading positions.
        prefix_len = attention_mask.shape[1] - causal.shape[1]
        if prefix_len > 0:
            always_visible = torch.ones(
                (batch_size, seq_length, prefix_len),
                device=device,
                dtype=causal.dtype,
            )
            causal = torch.cat([always_visible, causal], axis=-1)

        keep = causal[:, None, :, :] * attention_mask[:, None, None, :]

    # Added to raw attention scores before the softmax: 0.0 keeps a position,
    # -10000.0 effectively removes it.
    keep = keep.to(dtype=self.dtype)  # fp16 compatibility
    return (1.0 - keep) * -10000.0
+ use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds or encoder_embeds" + ) + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + 
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    is_decoder=False,
    mode="multimodal",
    soft_labels=None,
    alpha=0,
    return_logits=False,
):
    r"""
    Run the wrapped BERT model and the MLM head, optionally computing a loss.

    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
        config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
        (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
    soft_labels (`optional`):
        When given, a distillation term (negative sum of ``log_softmax(scores) * soft_labels``, averaged
        over positions whose label is not ``-100``) is blended into the loss with weight ``alpha``.
    return_logits (:obj:`bool`):
        When true, only the prediction scores are returned and no loss is computed.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict

    bert_out = self.bert(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_embeds=encoder_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        is_decoder=is_decoder,
        mode=mode,
    )

    scores = self.cls(bert_out[0])
    if return_logits:
        return scores

    loss = None
    if labels is not None:
        # CrossEntropyLoss ignores targets equal to -100 by default.
        flat_scores = scores.view(-1, self.config.vocab_size)
        loss = CrossEntropyLoss()(flat_scores, labels.view(-1))

    if soft_labels is not None:
        log_probs = F.log_softmax(scores, dim=-1)
        distill = -torch.sum(log_probs * soft_labels, dim=-1)
        distill = distill[labels != -100].mean()
        loss = (1 - alpha) * loss + alpha * distill

    if return_dict:
        return MaskedLMOutput(
            loss=loss,
            logits=scores,
            hidden_states=bert_out.hidden_states,
            attentions=bert_out.attentions,
        )

    # Tuple form: drop the first two entries of the BERT output and prepend
    # the scores (and the loss, when there is one).
    result = (scores,) + bert_out[2:]
    if loss is not None:
        result = (loss,) + result
    return result
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    labels=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    return_logits=False,
    is_decoder=True,
    reduction="mean",
    mode="multimodal",
    soft_labels=None,
    alpha=0,
):
    r"""
    Run the wrapped BERT model and the LM head, optionally computing a next-token loss.

    encoder_hidden_states, encoder_attention_mask:
        Forwarded unchanged to the wrapped BERT model.
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
        ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
        ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
        Passing labels forces ``use_cache`` to :obj:`False`.
    past_key_values, use_cache:
        Forwarded to the wrapped BERT model (see its documentation).
    return_logits (:obj:`bool`):
        When true, return only the prediction scores with the last position dropped.
    reduction (:obj:`str`):
        Reduction passed to ``CrossEntropyLoss``; with ``"none"`` the per-token losses are summed per sequence.
    soft_labels, alpha:
        When ``soft_labels`` is given, a distillation term is blended into the loss with weight ``alpha``.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict
    if labels is not None:
        use_cache = False

    bert_out = self.bert(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        is_decoder=is_decoder,
        mode=mode,
    )

    scores = self.cls(bert_out[0])

    if return_logits:
        return scores[:, :-1, :].contiguous()

    loss = None
    if labels is not None:
        # we are doing next-token prediction; shift prediction scores and input ids by one
        shifted_scores = scores[:, :-1, :].contiguous()
        labels = labels[:, 1:].contiguous()
        criterion = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
        loss = criterion(
            shifted_scores.view(-1, self.config.vocab_size),
            labels.view(-1),
        )
        if reduction == "none":
            # One summed loss per sequence.
            loss = loss.view(scores.size(0), -1).sum(1)

    if soft_labels is not None:
        # NOTE(review): this branch reads the shifted scores and shifted
        # labels computed above, so it presumably expects `labels` to be given.
        log_probs = F.log_softmax(shifted_scores, dim=-1)
        distill = -torch.sum(log_probs * soft_labels, dim=-1)
        distill = (distill * (labels != -100)).sum(1)
        loss = (1 - alpha) * loss + alpha * distill

    if return_dict:
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=scores,
            past_key_values=bert_out.past_key_values,
            hidden_states=bert_out.hidden_states,
            attentions=bert_out.attentions,
            cross_attentions=bert_out.cross_attentions,
        )

    # Tuple form: drop the first two entries of the BERT output and prepend
    # the scores (and the loss, when there is one).
    result = (scores,) + bert_out[2:]
    if loss is not None:
        result = (loss,) + result
    return result
def generate_from_encoder(
    self,
    tokenized_prompt,
    visual_embeds,
    sep_token_id,
    pad_token_id,
    use_nucleus_sampling=False,
    num_beams=3,
    max_length=30,
    min_length=10,
    top_p=0.9,
    repetition_penalty=1.0,
    **kwargs
):
    """Generate token ids conditioned on ``visual_embeds`` via ``self.generate``.

    Args:
        tokenized_prompt: Object whose ``input_ids`` seed the generation.
        visual_embeds: Tensor passed as ``encoder_hidden_states``. For beam
            search it is repeated ``num_beams`` times along dim 0 first.
        sep_token_id: Passed as ``eos_token_id``.
        pad_token_id: Passed as ``pad_token_id``.
        use_nucleus_sampling: Sample with ``top_p`` instead of beam search.
        num_beams: Beam width; only used for beam search.
        max_length: Forwarded to ``self.generate``.
        min_length: Forwarded to ``self.generate``.
        top_p: Nucleus probability mass; only used when sampling.
        repetition_penalty: Only used for beam search.
        **kwargs: Accepted and ignored.

    Returns:
        Whatever ``self.generate`` returns.
    """
    if not use_nucleus_sampling:
        # One copy of the visual features per beam.
        visual_embeds = visual_embeds.repeat_interleave(num_beams, dim=0)

    # All-ones mask over every dimension of the embeddings except the last.
    image_atts = torch.ones(visual_embeds.size()[:-1], dtype=torch.long).to(
        self.device
    )

    generate_args = dict(
        input_ids=tokenized_prompt.input_ids,
        max_length=max_length,
        min_length=min_length,
        eos_token_id=sep_token_id,
        pad_token_id=pad_token_id,
        encoder_hidden_states=visual_embeds,
        encoder_attention_mask=image_atts,
    )

    if use_nucleus_sampling:
        # NOTE(review): the caller's `repetition_penalty` is not used on this
        # path; a fixed 1.1 is applied instead. Confirm this is intended.
        generate_args.update(
            do_sample=True,
            top_p=top_p,
            num_return_sequences=1,
            repetition_penalty=1.1,
        )
    else:
        generate_args.update(
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
        )

    return self.generate(**generate_args)
def prepare_qa_input(sample, num_captions, num_captions_fid):
    """Group each question's captions into question-prefixed context strings.

    For every ``(question, captions)`` pair the first ``num_captions``
    captions are split into consecutive groups of ``num_captions_fid``. Each
    group becomes one string of the form
    ``"<question> \\n <caption>. <caption>. ..."`` (lower-cased, with a
    literal backslash-n separator). The result is stored in
    ``sample['question_captions']`` as one list of strings per question.

    A final, shorter group is always emitted, including when a question has
    fewer than ``num_captions`` captions; previously those trailing captions
    were silently dropped.

    Args:
        sample (dict): Must contain ``'text_input'`` (iterable of question
            strings) and ``'captions'`` (iterable of lists of caption
            strings), aligned by position. Modified in place.
        num_captions (int): Maximum number of captions used per question.
        num_captions_fid (int): Number of captions per context string.

    Returns:
        None. The result is written to ``sample['question_captions']``.
    """
    sample_question_captions = []

    for question, captions in zip(sample['text_input'], sample['captions']):
        assert isinstance(captions, list)
        prefix = question.lower().strip() + " \\n "
        selected = captions[:num_captions]
        last = len(selected)

        question_captions = []
        pending = []
        for count, caption in enumerate(selected, start=1):
            pending.append(caption.strip() + '. ')
            # Flush on a full group, or on the last caption that is used.
            if count == last or count % num_captions_fid == 0:
                question_captions.append(prefix + ''.join(pending).lower().strip())
                pending = []
        sample_question_captions.append(question_captions)

    sample['question_captions'] = sample_question_captions
@registry.register_model("pnp_unifiedqav2_fid")
class PNPUnifiedQAv2FiD(T5ForConditionalGeneration, BaseModel):
    """T5 question-answering model that reads several contexts per question.

    Inputs of shape ``(batch, num_contexts, length)`` are flattened to
    ``(batch, num_contexts * length)`` before being handed to T5. The number
    of contexts is stored on ``self.encoder.num_contexts`` so that the
    encoder wrapper installed by :meth:`load_unifiedqa` can split them again.
    """

    PRETRAINED_MODEL_CONFIG_DICT = {}

    def __init__(self, config, model_path):
        super().__init__(config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_path)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Flatten multi-context inputs and delegate to the T5 forward pass."""
        # Identity checks: tensors should not be compared to None with `!=`.
        if input_ids is not None:
            if input_ids.dim() == 3:
                self.encoder.num_contexts = input_ids.size(1)
            input_ids = input_ids.view(input_ids.size(0), -1)
        if attention_mask is not None:
            attention_mask = attention_mask.view(attention_mask.size(0), -1)

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs
        )

    def generate(self, input_ids, attention_mask, num_beams=1, min_length=0, max_length=20):
        """Generate answers; dim 1 of ``input_ids`` is taken as the context count."""
        self.encoder.num_contexts = input_ids.size(1)

        return super().generate(
            input_ids=input_ids.view(input_ids.size(0), -1),
            attention_mask=attention_mask.view(attention_mask.size(0), -1),
            num_beams=num_beams,
            min_length=min_length,
            max_length=max_length
        )

    def load_unifiedqa(self, state_dict):
        """Load ``state_dict`` and then wrap the encoder in ``T5EncoderWrapper``."""
        # The weights are loaded first, while the parameter names still match
        # the plain T5 layout.
        self.load_state_dict(state_dict)
        self.encoder = T5EncoderWrapper(self.encoder)

    @classmethod
    def from_config(cls, cfg):
        """Build the model from ``cfg`` keys ``pretrained`` and ``t5_config_path``."""
        model_path = cfg.get('pretrained')
        t5_config_path = get_abs_path(cfg.get("t5_config_path"))
        t5_config = T5Config.from_json_file(t5_config_path)
        model = cls(t5_config, model_path)
        model.load_unifiedqa(T5ForConditionalGeneration.from_pretrained(model_path).state_dict())

        return model
@registry.register_model("pnp_vqa")
class PNPVQA(BaseModel):
    """
    PNPVQA model consists of three submodels for zero-shot VQA:
        1. Image-questioning matching model
        2. Image captioning model
        3. Question answering model

    Supported model types:
        - base: BLIPITM, BLIPCaption, PNPUnifiedQAv2FiD (t5-base)
        - large: BLIPITM, BLIPCaption, PNPUnifiedQAv2FiD (t5-large)
        - 3b: BLIPITM, BLIPCaption, PNPUnifiedQAv2FiD (t5-3b)

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("pnp_vqa", "base", is_eval=True)
        >>> model = load_model("pnp_vqa", "large", is_eval=True)
        >>> model = load_model("pnp_vqa", "3b", is_eval=True)
    """

    # Maps each supported model type to its YAML config path.
    PRETRAINED_MODEL_CONFIG_DICT = {"base": "configs/models/pnp-vqa/pnp_vqa_base.yaml",
                                    "large": "configs/models/pnp-vqa/pnp_vqa_large.yaml",
                                    "3b": "configs/models/pnp-vqa/pnp_vqa_3b.yaml",
                                    }

    def __init__(self, image_question_matching_model, image_captioning_model,
                 question_answering_model, offload_model=False):
        """
        Args:
            image_question_matching_model: Model used by ``forward_itm`` to compute gradcams.
            image_captioning_model: Model used by ``forward_cap`` to generate captions.
            question_answering_model: Model used by ``forward_qa`` to generate answers.
            offload_model (bool): If True, ``predict_answers`` moves the matching and
                captioning models to the CPU while question answering runs.
        """
        super().__init__()

        self.image_question_matching_model = image_question_matching_model
        self.image_captioning_model = image_captioning_model
        self.question_answering_model = question_answering_model
        self.offload_model = offload_model

    def forward_itm(self, samples, block_num=7):
        """
        Compute gradcams for each image/question pair and store them in ``samples``.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
            block_num (int): The index of cross-attention block for gradcam computation.

        Returns:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
        """
        image = samples['image']
        # Remove leading/trailing question marks before tokenizing.
        question = [text.strip('?') for text in samples['text_input']]
        tokenized_text = self.image_question_matching_model.tokenizer(question, padding='longest', truncation=True,
                                                return_tensors="pt").to(self.image_question_matching_model.device)
        # Gradients are switched on explicitly, so this also works when the
        # caller is inside a no-grad context.
        with torch.set_grad_enabled(True):
            gradcams, _ = compute_gradcam(model=self.image_question_matching_model,
                            visual_input=image,
                            text_input=question,
                            tokenized_text=tokenized_text,
                            block_num=block_num)

        # NOTE(review): only element 1 of each per-sample gradcam entry is kept;
        # what index 1 represents is defined by compute_gradcam -- confirm there.
        gradcams = [gradcam_[1] for gradcam_ in gradcams]
        # Flatten every map to one row per image.
        samples['gradcams'] = torch.stack(gradcams).reshape(samples['image'].size(0), -1)

        return samples

    def forward_cap(
            self,
            samples,
            cap_max_length=20,
            cap_min_length=0,
            top_p=1,
            top_k=50,
            repetition_penalty=1.0,
            num_captions=100,
            num_patches=20,
    ):
        """
        Generate captions from image patches sampled according to the gradcams.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
            cap_max_length (int): The maximum length of the caption to be generated.
            cap_min_length (int): The minimum length of the caption to be generated.
            top_p (float): The cumulative probability for nucleus sampling.
            top_k (float): The number of the highest probability tokens for top-k sampling.
            repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
            num_captions (int): Number of captions generated for each image.
            num_patches (int): Number of patches sampled for each image.

        Returns:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
                - captions (nested list): A nested list of strings of total length batch_size * num_captions
        """
        encoder_out = self.image_captioning_model.forward_encoder(samples)
        # One caption list per image in the batch.
        captions = [[] for _ in range(encoder_out.size(0))]

        min_num_captions = 0

        # Keep sampling rounds until every image has collected num_captions
        # captions; the loop only ends once the shortest list is full.
        while min_num_captions < num_captions:
            encoder_out_samples = []
            for i in range(num_captions):
                # Draw num_patches indices per image, weighted by the gradcam values.
                # NOTE(review): the "+ 1" presumably skips a leading non-patch
                # token in encoder_out -- confirm against the captioning encoder.
                patch_id = torch.multinomial(samples['gradcams'].to(self.image_captioning_model.device),
                                             num_patches).reshape(encoder_out.size(0), -1) + 1
                # Sort the indices and expand them over the feature dimension for gather.
                patch_id = patch_id.sort(dim=1).values.unsqueeze(-1).expand(-1, -1, encoder_out.size(2))
                encoder_out_sample = torch.gather(encoder_out, 1, patch_id)
                encoder_out_samples.append(encoder_out_sample)

            stacked = torch.stack(encoder_out_samples, dim=1)
            image_embeds = torch.flatten(stacked, start_dim=0, end_dim=1) #(bsz*num_seq, num_patch, dim)

            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.image_captioning_model.device)
            model_kwargs = {
                "encoder_hidden_states": image_embeds,
                "encoder_attention_mask": image_atts,
            }

            # Same text prompt for every sampled patch set.
            prompt = [self.image_captioning_model.prompt] * image_embeds.size(0)
            prompt = self.image_captioning_model.tokenizer(prompt,
                                                           return_tensors="pt").to(self.image_captioning_model.device)
            # Overwrite the first token with BOS and drop the last token.
            # NOTE(review): the dropped token is presumably the separator the
            # tokenizer appends -- confirm.
            prompt.input_ids[:, 0] = self.image_captioning_model.tokenizer.bos_token_id
            prompt.input_ids = prompt.input_ids[:, :-1]

            decoder_out = self.image_captioning_model.text_decoder.generate(
                input_ids=prompt.input_ids,
                max_length=cap_max_length,
                min_length=cap_min_length,
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                num_return_sequences=1,
                eos_token_id=self.image_captioning_model.tokenizer.sep_token_id,
                pad_token_id=self.image_captioning_model.tokenizer.pad_token_id,
                repetition_penalty=repetition_penalty,
                **model_kwargs)

            outputs = self.image_captioning_model.tokenizer.batch_decode(decoder_out, skip_special_tokens=True)

            for counter, output in enumerate(outputs):
                # Outputs are grouped num_captions per image, in batch order.
                ind = counter//num_captions
                if len(captions[ind]) < num_captions:
                    # Remove the prompt text from the decoded string.
                    caption = output[len(self.image_captioning_model.prompt):]
                    # Skip a caption that is a substring of one already kept.
                    overlap_caption = [1 for caps in captions[ind] if caption in caps]
                    if len(overlap_caption) == 0:
                        captions[ind].append(caption)

            min_num_captions = min([len(i) for i in captions])

        samples['captions'] = captions

        return samples

    def forward_qa(
            self,
            samples,
            num_beams=1,
            max_len=20,
            min_len=0,
            internal_bsz_fid=1,
            num_captions=100,
            num_captions_fid=1,
    ):
        """
        Answer each question from its question/caption strings.

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
                - text_input (list): A list of strings of length batch_size
                - gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
                - captions (nested list): A nested list of strings of total length batch_size * num_captions
                - question_captions (nested list): A nested list of concatenated strings of questions and captions
            num_beams (int): Number of beams for beam search. 1 means no beam search.
            max_len (int): Maximum length of generated answers.
            min_len (int): Minimum length of generated answers.
            internal_bsz_fid (int): Internal batch size when using FiD decoding.
            num_captions (int): Number of captions generated for each image.
            num_captions_fid (int): Number of captions concatenated with a question during FiD decoding.

        Returns:
            List: A list of strings, each string is an answer.
        """
        # Populates samples['question_captions'] (read on the next lines).
        prepare_qa_input(samples, num_captions=num_captions, num_captions_fid=num_captions_fid)

        pred_answers = []
        question_captions = samples['question_captions']
        question_captions_chunk = [question_captions[i:i + internal_bsz_fid]
                                   for i in range(0, len(question_captions), internal_bsz_fid)]
        # chain(*...) flattens the chunks again, so the loop below still visits
        # one element of question_captions at a time.
        question_captions_chunk = list(chain(*question_captions_chunk))

        for question_caption in question_captions_chunk:
            question_caption_input = self.question_answering_model.tokenizer(question_caption, padding='longest',
                                        truncation=True, return_tensors="pt").to(self.question_answering_model.device)

            # Reshape to (internal_bsz_fid, num_contexts, length), the layout
            # passed to the QA model's generate().
            question_caption_input.input_ids = question_caption_input.input_ids.reshape(
                                               internal_bsz_fid, -1, question_caption_input.input_ids.size(1))
            question_caption_input.attention_mask = question_caption_input.attention_mask.reshape(
                                               internal_bsz_fid, -1, question_caption_input.attention_mask.size(1))

            outputs = self.question_answering_model.generate(input_ids=question_caption_input.input_ids,
                                            attention_mask=question_caption_input.attention_mask,
                                            num_beams=num_beams,
                                            min_length=min_len,
                                            max_length=max_len,
                                            )

            for output in outputs:
                pred_answer = self.question_answering_model.tokenizer.decode(output, skip_special_tokens=True)
                pred_answers.append(pred_answer)

        return pred_answers

    def predict_answers(
        self,
        samples,
        num_beams=1,
        inference_method="generate",
        max_len=20,
        min_len=0,
        internal_bsz_fid=1,
        num_captions=50,
        num_captions_fid=1,
        cap_max_length=20,
        cap_min_length=10,
        top_k=50,
        top_p=1,
        repetition_penalty=1,
        num_patches=50,
        block_num=7,
    ):
        """
        Run the full pipeline: gradcams (forward_itm), captions (forward_cap), answers (forward_qa).

        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W). Default H=480, W=480.
                - text_input (str or [str]): String or a list of strings, each string is a question.
                                             The number of questions must be equal to the batch size. If a single string, will be converted to a list of string, with length 1 first.
            num_beams (int): Number of beams for beam search. 1 means no beam search.
            inference_method (str): Inference method. Must be "generate". The model will generate answers.
            max_len (int): Maximum length of generated answers.
            min_len (int): Minimum length of generated answers.
            internal_bsz_fid (int): Internal batch size when using FiD decoding.
            num_captions (int): Number of captions generated for each image.
            num_captions_fid (int): Number of captions concatenated with a question during FiD decoding.
            cap_max_length (int): The maximum length of the caption to be generated.
            cap_min_length (int): The minimum length of the caption to be generated.
            top_k (float): The number of the highest probability tokens for top-k sampling.
            top_p (float): The cumulative probability for nucleus sampling.
            repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
            num_patches (int): Number of patches sampled for each image.
            block_num (int): The index of cross-attention block for gradcam computation.

        Returns:
            List: A list of strings, each string is an answer.
            captions (nested list): A nested list of strings of total length batch_size * num_captions
            gradcams (torch.Tensor): A tensor of shape (batch_size, H*W)
        """
        assert inference_method in [
            "generate",
        ], "Inference method must be 'generate', got {}.".format(
            inference_method
        )

        # Accept a single question string by wrapping it in a list.
        if isinstance(samples["text_input"], str):
            samples["text_input"] = [samples["text_input"]]

        assert len(samples["text_input"]) == samples["image"].size(
            0
        ), "The number of questions must be equal to the batch size."

        samples = self.forward_itm(samples, block_num=block_num)

        samples = self.forward_cap(samples,
                                   cap_max_length=cap_max_length,
                                   cap_min_length=cap_min_length,
                                   top_k=top_k,
                                   top_p=top_p,
                                   repetition_penalty=repetition_penalty,
                                   num_captions=num_captions,
                                   num_patches=num_patches)

        # The image and the first two models are no longer needed for QA, so
        # move them to the CPU and release cached CUDA memory first.
        if self.offload_model:
            samples['image'] = samples['image'].to('cpu')
            self.image_question_matching_model.to('cpu')
            self.image_captioning_model.to('cpu')
        torch.cuda.empty_cache()

        pred_answers = self.forward_qa(samples,
                                  num_beams=num_beams,
                                  max_len=max_len,
                                  min_len=min_len,
                                  internal_bsz_fid=internal_bsz_fid,
                                  num_captions=num_captions,
                                  num_captions_fid=num_captions_fid)

        # Move the offloaded models back next to the QA model for the next call.
        if self.offload_model:
            self.image_question_matching_model.to(self.question_answering_model.device)
            self.image_captioning_model.to(self.question_answering_model.device)

        return pred_answers, samples['captions'], samples['gradcams']

    @classmethod
    def from_config(cls, model_config):
        """Build the three sub-models from their config sections and assemble a PNPVQA."""
        itm_config = model_config.image_question_matching_model
        cap_config = model_config.image_captioning_model
        qa_config = model_config.question_answering_model

        # Resolve each architecture name to its registered model class.
        itm_cls = registry.get_model_class(itm_config.arch)
        cap_cls = registry.get_model_class(cap_config.arch)
        qa_cls = registry.get_model_class(qa_config.arch)

        image_question_matching_model = itm_cls.from_config(itm_config)
        image_captioning_model = cap_cls.from_config(cap_config)
        question_answering_model = qa_cls.from_config(qa_config)

        # Offloading is enabled only for the '3b' model type.
        model = cls(image_question_matching_model=image_question_matching_model,
                    image_captioning_model=image_captioning_model,
                    question_answering_model=question_answering_model,
                    offload_model= True if model_config.model_type == '3b' else False,
                    )

        return model
/dev/null +++ b/lavis/models/timesformer/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/facebookresearch/TimeSformer +""" diff --git a/lavis/models/timesformer/__pycache__/__init__.cpython-310.pyc b/lavis/models/timesformer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d6fbd577d16cb9dc96859acb4607ec8849863c0 Binary files /dev/null and b/lavis/models/timesformer/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/timesformer/__pycache__/helpers.cpython-310.pyc b/lavis/models/timesformer/__pycache__/helpers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72ecf71e766dbad0731f092ee057141a852325df Binary files /dev/null and b/lavis/models/timesformer/__pycache__/helpers.cpython-310.pyc differ diff --git a/lavis/models/timesformer/__pycache__/vit.cpython-310.pyc b/lavis/models/timesformer/__pycache__/vit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a837e608b00f4fbb742f94f5b9723799d690509 Binary files /dev/null and b/lavis/models/timesformer/__pycache__/vit.cpython-310.pyc differ diff --git a/lavis/models/timesformer/__pycache__/vit_utils.cpython-310.pyc b/lavis/models/timesformer/__pycache__/vit_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf303b5daebf44780cdfcc0a9431c83263f429d0 Binary files /dev/null and b/lavis/models/timesformer/__pycache__/vit_utils.cpython-310.pyc differ diff --git a/lavis/models/timesformer/conv2d_same.py b/lavis/models/timesformer/conv2d_same.py new file mode 100644 index 0000000000000000000000000000000000000000..ad23cc5a75e48d08137053c5e481a2feb8356b50 --- /dev/null +++ b/lavis/models/timesformer/conv2d_same.py @@ 
def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0):
    """Pad the last two dims of ``x`` with TF-style 'SAME' padding.

    ``k``, ``s`` and ``d`` are the (height, width) kernel size, stride and
    dilation.  Any odd padding amount puts the extra element on the
    bottom/right.  ``x`` is returned untouched when no padding is needed.
    """
    height, width = x.size()[-2:]
    extra_h = get_same_padding(height, k[0], s[0], d[0])
    extra_w = get_same_padding(width, k[1], s[1], d[1])
    if extra_h <= 0 and extra_w <= 0:
        return x
    top = extra_h // 2
    left = extra_w // 2
    # F.pad expects the last dimension first: (left, right, top, bottom).
    return F.pad(x, [left, extra_w - left, top, extra_h - top], value=value)


def get_same_padding(x: int, k: int, s: int, d: int):
    """Return the total 'SAME' padding along one axis.

    ``x`` is the input size, ``k`` the kernel size, ``s`` the stride and
    ``d`` the dilation.  The result is never negative.
    """
    required_extent = (math.ceil(x / s) - 1) * s + (k - 1) * d + 1
    return max(required_extent - x, 0)


def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]:
    """Resolve a padding spec into ``(padding, is_dynamic)``.

    Non-string values are returned unchanged.  Strings are matched
    case-insensitively:

    * ``"valid"`` -> ``0``
    * ``"same"``  -> static padding when ``is_static_pad`` allows it,
      otherwise ``0`` with ``is_dynamic=True`` (padding is computed per input)
    * anything else -> symmetric padding from ``get_padding``
    """
    if not isinstance(padding, str):
        return padding, False

    mode = padding.lower()
    if mode == "valid":
        return 0, False
    if mode == "same" and not is_static_pad(kernel_size, **kwargs):
        # Dynamic 'SAME' padding: costs extra runtime / GPU memory.
        return 0, True
    return get_padding(kernel_size, **kwargs), False


def conv2d_same(
    x,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    stride: Tuple[int, int] = (1, 1),
    padding: Tuple[int, int] = (0, 0),
    dilation: Tuple[int, int] = (1, 1),
    groups: int = 1,
):
    """2D convolution with 'SAME' padding applied to the input first.

    ``padding`` is accepted for signature compatibility but not used: the
    input is padded by ``pad_same`` and the convolution runs with zero padding.
    """
    padded = pad_same(x, weight.shape[-2:], stride, dilation)
    return F.conv2d(
        padded,
        weight,
        bias=bias,
        stride=stride,
        padding=(0, 0),
        dilation=dilation,
        groups=groups,
    )


class Conv2dSame(nn.Conv2d):
    """``nn.Conv2d`` variant that pads its input dynamically ('SAME' style)."""

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias=True,
    ):
        # The ``padding`` argument is ignored; the layer is created with zero padding.
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=0,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        return conv2d_same(
            x,
            self.weight,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )


def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
    """Create a conv layer for the requested ``padding`` keyword.

    ``bias`` defaults to ``False``.  A ``Conv2dSame`` is returned when the
    padding has to be computed dynamically, a plain ``nn.Conv2d`` otherwise.
    """
    requested = kwargs.pop("padding", "")
    kwargs.setdefault("bias", False)
    resolved, needs_dynamic = get_padding_value(requested, kernel_size, **kwargs)
    if needs_dynamic:
        return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
    return nn.Conv2d(in_chs, out_chs, kernel_size, padding=resolved, **kwargs)
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/facebookresearch/TimeSformer +""" + +# Copyright 2020 Ross Wightman + +from collections import OrderedDict, defaultdict +from copy import deepcopy +from functools import partial +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn + + +class FeatureInfo: + def __init__(self, feature_info: List[Dict], out_indices: Tuple[int]): + prev_reduction = 1 + for fi in feature_info: + # sanity check the mandatory fields, there may be additional fields depending on the model + assert "num_chs" in fi and fi["num_chs"] > 0 + assert "reduction" in fi and fi["reduction"] >= prev_reduction + prev_reduction = fi["reduction"] + assert "module" in fi + self.out_indices = out_indices + self.info = feature_info + + def from_other(self, out_indices: Tuple[int]): + return FeatureInfo(deepcopy(self.info), out_indices) + + def get(self, key, idx=None): + """Get value by key at specified index (indices) + if idx == None, returns value for key at each output index + if idx is an integer, return value for that feature module index (ignoring output indices) + if idx is a list/tupple, return value for each module index (ignoring output indices) + """ + if idx is None: + return [self.info[i][key] for i in self.out_indices] + if isinstance(idx, (tuple, list)): + return [self.info[i][key] for i in idx] + else: + return self.info[idx][key] + + def get_dicts(self, keys=None, idx=None): + """return info dicts for specified keys (or all if None) at specified indices (or out_indices if None)""" + if idx is None: + if keys is None: + return [self.info[i] for i in self.out_indices] + else: + return [{k: self.info[i][k] for k in keys} for i in self.out_indices] + if isinstance(idx, (tuple, list)): + return [ + self.info[i] if keys is None else {k: self.info[i][k] for k in keys} + for i in idx + ] + 
else: + return ( + self.info[idx] if keys is None else {k: self.info[idx][k] for k in keys} + ) + + def channels(self, idx=None): + """feature channels accessor""" + return self.get("num_chs", idx) + + def reduction(self, idx=None): + """feature reduction (output stride) accessor""" + return self.get("reduction", idx) + + def module_name(self, idx=None): + """feature module name accessor""" + return self.get("module", idx) + + def __getitem__(self, item): + return self.info[item] + + def __len__(self): + return len(self.info) + + +class FeatureHooks: + """Feature Hook Helper + This module helps with the setup and extraction of hooks for extracting features from + internal nodes in a model by node name. This works quite well in eager Python but needs + redesign for torcscript. + """ + + def __init__(self, hooks, named_modules, out_map=None, default_hook_type="forward"): + # setup feature hooks + modules = {k: v for k, v in named_modules} + for i, h in enumerate(hooks): + hook_name = h["module"] + m = modules[hook_name] + hook_id = out_map[i] if out_map else hook_name + hook_fn = partial(self._collect_output_hook, hook_id) + hook_type = h["hook_type"] if "hook_type" in h else default_hook_type + if hook_type == "forward_pre": + m.register_forward_pre_hook(hook_fn) + elif hook_type == "forward": + m.register_forward_hook(hook_fn) + else: + assert False, "Unsupported hook type" + self._feature_outputs = defaultdict(OrderedDict) + + def _collect_output_hook(self, hook_id, *args): + x = args[ + -1 + ] # tensor we want is last argument, output for fwd, input for fwd_pre + if isinstance(x, tuple): + x = x[0] # unwrap input tuple + self._feature_outputs[x.device][hook_id] = x + + def get_output(self, device) -> Dict[str, torch.tensor]: + output = self._feature_outputs[device] + self._feature_outputs[device] = OrderedDict() # clear after reading + return output + + +def _module_list(module, flatten_sequential=False): + # a yield/iter would be better for this but wouldn't be 
compatible with torchscript + ml = [] + for name, module in module.named_children(): + if flatten_sequential and isinstance(module, nn.Sequential): + # first level of Sequential containers is flattened into containing model + for child_name, child_module in module.named_children(): + combined = [name, child_name] + ml.append(("_".join(combined), ".".join(combined), child_module)) + else: + ml.append((name, name, module)) + return ml + + +def _get_feature_info(net, out_indices): + feature_info = getattr(net, "feature_info") + if isinstance(feature_info, FeatureInfo): + return feature_info.from_other(out_indices) + elif isinstance(feature_info, (list, tuple)): + return FeatureInfo(net.feature_info, out_indices) + else: + assert False, "Provided feature_info is not valid" + + +def _get_return_layers(feature_info, out_map): + module_names = feature_info.module_name() + return_layers = {} + for i, name in enumerate(module_names): + return_layers[name] = ( + out_map[i] if out_map is not None else feature_info.out_indices[i] + ) + return return_layers + + +class FeatureDictNet(nn.ModuleDict): + """Feature extractor with OrderedDict return + Wrap a model and extract features as specified by the out indices, the network is + partially re-built from contained modules. + There is a strong assumption that the modules have been registered into the model in the same + order as they are used. There should be no reuse of the same nn.Module more than once, including + trivial modules like `self.relu = nn.ReLU`. + Only submodules that are directly assigned to the model class (`model.feature1`) or at most + one Sequential container deep (`model.features.1`, with flatten_sequent=True) can be captured. 
+ All Sequential containers that are directly assigned to the original model will have their + modules assigned to this module with the name `model.features.1` being changed to `model.features_1` + Arguments: + model (nn.Module): model from which we will extract the features + out_indices (tuple[int]): model output indices to extract features for + out_map (sequence): list or tuple specifying desired return id for each out index, + otherwise str(index) is used + feature_concat (bool): whether to concatenate intermediate features that are lists or tuples + vs select element [0] + flatten_sequential (bool): whether to flatten sequential modules assigned to model + """ + + def __init__( + self, + model, + out_indices=(0, 1, 2, 3, 4), + out_map=None, + feature_concat=False, + flatten_sequential=False, + ): + super(FeatureDictNet, self).__init__() + self.feature_info = _get_feature_info(model, out_indices) + self.concat = feature_concat + self.return_layers = {} + return_layers = _get_return_layers(self.feature_info, out_map) + modules = _module_list(model, flatten_sequential=flatten_sequential) + remaining = set(return_layers.keys()) + layers = OrderedDict() + for new_name, old_name, module in modules: + layers[new_name] = module + if old_name in remaining: + # return id has to be consistently str type for torchscript + self.return_layers[new_name] = str(return_layers[old_name]) + remaining.remove(old_name) + if not remaining: + break + assert not remaining and len(self.return_layers) == len( + return_layers + ), f"Return layers ({remaining}) are not present in model" + self.update(layers) + + def _collect(self, x) -> (Dict[str, torch.Tensor]): + out = OrderedDict() + for name, module in self.items(): + x = module(x) + if name in self.return_layers: + out_id = self.return_layers[name] + if isinstance(x, (tuple, list)): + # If model tap is a tuple or list, concat or select first element + # FIXME this may need to be more generic / flexible for some nets + out[out_id] = 
torch.cat(x, 1) if self.concat else x[0] + else: + out[out_id] = x + return out + + def forward(self, x) -> Dict[str, torch.Tensor]: + return self._collect(x) + + +class FeatureListNet(FeatureDictNet): + """Feature extractor with list return + See docstring for FeatureDictNet above, this class exists only to appease Torchscript typing constraints. + In eager Python we could have returned List[Tensor] vs Dict[id, Tensor] based on a member bool. + """ + + def __init__( + self, + model, + out_indices=(0, 1, 2, 3, 4), + out_map=None, + feature_concat=False, + flatten_sequential=False, + ): + super(FeatureListNet, self).__init__( + model, + out_indices=out_indices, + out_map=out_map, + feature_concat=feature_concat, + flatten_sequential=flatten_sequential, + ) + + def forward(self, x) -> (List[torch.Tensor]): + return list(self._collect(x).values()) + + +class FeatureHookNet(nn.ModuleDict): + """FeatureHookNet + Wrap a model and extract features specified by the out indices using forward/forward-pre hooks. + If `no_rewrite` is True, features are extracted via hooks without modifying the underlying + network in any way. + If `no_rewrite` is False, the model will be re-written as in the + FeatureList/FeatureDict case by folding first to second (Sequential only) level modules into this one. + FIXME this does not currently work with Torchscript, see FeatureHooks class + """ + + def __init__( + self, + model, + out_indices=(0, 1, 2, 3, 4), + out_map=None, + out_as_dict=False, + no_rewrite=False, + feature_concat=False, + flatten_sequential=False, + default_hook_type="forward", + ): + super(FeatureHookNet, self).__init__() + assert not torch.jit.is_scripting() + self.feature_info = _get_feature_info(model, out_indices) + self.out_as_dict = out_as_dict + layers = OrderedDict() + hooks = [] + if no_rewrite: + assert not flatten_sequential + if hasattr(model, "reset_classifier"): # make sure classifier is removed? 
+ model.reset_classifier(0) + layers["body"] = model + hooks.extend(self.feature_info.get_dicts()) + else: + modules = _module_list(model, flatten_sequential=flatten_sequential) + remaining = { + f["module"]: f["hook_type"] if "hook_type" in f else default_hook_type + for f in self.feature_info.get_dicts() + } + for new_name, old_name, module in modules: + layers[new_name] = module + for fn, fm in module.named_modules(prefix=old_name): + if fn in remaining: + hooks.append(dict(module=fn, hook_type=remaining[fn])) + del remaining[fn] + if not remaining: + break + assert ( + not remaining + ), f"Return layers ({remaining}) are not present in model" + self.update(layers) + self.hooks = FeatureHooks(hooks, model.named_modules(), out_map=out_map) + + def forward(self, x): + for name, module in self.items(): + x = module(x) + out = self.hooks.get_output(x.device) + return out if self.out_as_dict else list(out.values()) diff --git a/lavis/models/timesformer/helpers.py b/lavis/models/timesformer/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..1a8ebd1415fff35cd0f1e365a6f666dcb2f04fee --- /dev/null +++ b/lavis/models/timesformer/helpers.py @@ -0,0 +1,400 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/facebookresearch/TimeSformer +""" + +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
def _strip_prefix(state_dict, prefix):
    """Return an OrderedDict copy of ``state_dict`` with ``prefix`` removed from matching keys."""
    stripped = OrderedDict()
    for key, value in state_dict.items():
        # Only a full ``prefix`` (including its trailing dot) is removed, so a
        # key such as ``model_head.weight`` is left intact.
        name = key[len(prefix):] if key.startswith(prefix) else key
        stripped[name] = value
    return stripped


def load_state_dict(checkpoint_path, use_ema=False):
    """Load a state dict from a checkpoint file onto the CPU.

    Supported checkpoint layouts:

    * dict with ``state_dict`` (or ``state_dict_ema`` when ``use_ema`` is set
      and the key exists): a leading ``module.`` is stripped from each key;
    * dict with ``model_state``: a leading ``model.`` is stripped from each key;
    * anything else: the loaded object is returned as-is.

    Args:
        checkpoint_path: Path to the checkpoint file.
        use_ema: Prefer the ``state_dict_ema`` entry when present.

    Returns:
        The (possibly re-keyed) state dict.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` is empty or not a file.
    """
    if not checkpoint_path or not os.path.isfile(checkpoint_path):
        message = "No checkpoint found at '{}'".format(checkpoint_path)
        logging.error(message)
        raise FileNotFoundError(message)

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    state_dict_key = "state_dict"
    if not isinstance(checkpoint, dict):
        # Nothing to look up in a non-dict checkpoint; hand it back unchanged.
        state_dict = checkpoint
    else:
        if use_ema and "state_dict_ema" in checkpoint:
            state_dict_key = "state_dict_ema"
        if state_dict_key in checkpoint:
            state_dict = _strip_prefix(checkpoint[state_dict_key], "module.")
        elif "model_state" in checkpoint:
            state_dict_key = "model_state"
            state_dict = _strip_prefix(checkpoint[state_dict_key], "model.")
        else:
            state_dict = checkpoint
    logging.info(
        "Loaded {} from checkpoint '{}'".format(state_dict_key, checkpoint_path)
    )
    return state_dict


def load_checkpoint(model, checkpoint_path, use_ema=False, strict=True):
    """Load the weights stored at ``checkpoint_path`` into ``model``.

    Args:
        model: Module that receives the weights.
        checkpoint_path: Path to the checkpoint file.
        use_ema: Passed through to ``load_state_dict``.
        strict: Passed through to ``model.load_state_dict``.
    """
    state_dict = load_state_dict(checkpoint_path, use_ema)
    model.load_state_dict(state_dict, strict=strict)
def load_pretrained(
    model,
    cfg=None,
    num_classes=1000,
    in_chans=3,
    filter_fn=None,
    img_size=224,
    num_frames=8,
    num_patches=196,
    attention_type="divided_space_time",
    pretrained_model="",
    strict=True,
):
    """Load pretrained weights into ``model``, adapting them where shapes differ.

    The weights come from ``cfg["url"]`` when ``pretrained_model`` is empty,
    otherwise from the checkpoint at ``pretrained_model``. Before loading, the
    state dict is adjusted in place:

    * the first conv weight is summed / repeated along the channel axis when
      ``in_chans`` is not 3,
    * the classifier weights are trimmed (1001 -> 1000 classes) or dropped when
      their size does not match ``num_classes``,
    * ``pos_embed`` / ``time_embed`` are resized to ``num_patches + 1`` /
      ``num_frames`` entries,
    * for ``attention_type == "divided_space_time"`` every block's ``attn`` and
      ``norm1`` entries are copied to ``temporal_attn`` / ``temporal_norm1``
      keys that the checkpoint does not already provide.

    Args:
        model: module receiving the weights; its ``default_cfg`` attribute is
            used when ``cfg`` is None.
        cfg: mapping with at least ``url``, ``first_conv``, ``classifier`` and
            ``num_classes`` entries.
        num_classes: number of classes of the model being created.
        in_chans: number of input channels of the model being created.
        filter_fn: optional callable applied to the loaded state dict.
        img_size: unused; kept for interface compatibility.
        num_frames: target length of the temporal embedding.
        num_patches: target number of spatial patches (excluding the CLS token).
        attention_type: attention variant of the model.
        pretrained_model: checkpoint path; empty string means "use cfg['url']".
        strict: unused; kept for interface compatibility. Weights are always
            loaded with ``strict=False`` because the adapted state dict is not
            guaranteed to cover every parameter of ``model``.

    Returns:
        None. Returns early (leaving ``model`` untouched) when no usable
        configuration / URL is available.
    """
    if cfg is None:
        # Default to None so that a model without ``default_cfg`` hits the
        # warning below instead of raising AttributeError.
        cfg = getattr(model, "default_cfg", None)
    if cfg is None or "url" not in cfg or not cfg["url"]:
        logging.warning("Pretrained model URL is invalid, using random initialization.")
        return

    if len(pretrained_model) == 0:
        state_dict = model_zoo.load_url(cfg["url"], progress=False, map_location="cpu")
    else:
        # Read the checkpoint once; some checkpoints nest the weights under a
        # "model" entry, others are the state dict itself.
        checkpoint = load_state_dict(pretrained_model)
        if isinstance(checkpoint, dict) and "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint

    if filter_fn is not None:
        state_dict = filter_fn(state_dict)

    if in_chans == 1:
        conv1_name = cfg["first_conv"]
        logging.info(
            "Converting first conv (%s) pretrained weights from 3 to 1 channel"
            % conv1_name
        )
        conv1_weight = state_dict[conv1_name + ".weight"]
        conv1_type = conv1_weight.dtype
        # Sum in float32, then cast back to the checkpoint dtype.
        conv1_weight = conv1_weight.float()
        O, I, J, K = conv1_weight.shape
        if I > 3:
            assert conv1_weight.shape[1] % 3 == 0
            # For models with space2depth stems
            conv1_weight = conv1_weight.reshape(O, I // 3, 3, J, K)
            conv1_weight = conv1_weight.sum(dim=2, keepdim=False)
        else:
            conv1_weight = conv1_weight.sum(dim=1, keepdim=True)
        conv1_weight = conv1_weight.to(conv1_type)
        state_dict[conv1_name + ".weight"] = conv1_weight
    elif in_chans != 3:
        conv1_name = cfg["first_conv"]
        conv1_weight = state_dict[conv1_name + ".weight"]
        conv1_type = conv1_weight.dtype
        conv1_weight = conv1_weight.float()
        _, pretrained_chans, _, _ = conv1_weight.shape
        if pretrained_chans != 3:
            logging.warning(
                "Deleting first conv (%s) from pretrained weights." % conv1_name
            )
            del state_dict[conv1_name + ".weight"]
        else:
            logging.info(
                "Repeating first conv (%s) weights in channel dim." % conv1_name
            )
            repeat = int(math.ceil(in_chans / 3))
            conv1_weight = conv1_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :]
            # Rescale so the summed response over channels stays comparable.
            conv1_weight *= 3 / float(in_chans)
            conv1_weight = conv1_weight.to(conv1_type)
            state_dict[conv1_name + ".weight"] = conv1_weight

    classifier_name = cfg["classifier"]
    if num_classes == 1000 and cfg["num_classes"] == 1001:
        # special case for imagenet trained models with extra background class in pretrained weights
        classifier_weight = state_dict[classifier_name + ".weight"]
        state_dict[classifier_name + ".weight"] = classifier_weight[1:]
        classifier_bias = state_dict[classifier_name + ".bias"]
        state_dict[classifier_name + ".bias"] = classifier_bias[1:]
    elif num_classes != state_dict[classifier_name + ".weight"].size(0):
        # completely discard fully connected for all other differences between pretrained and created model
        del state_dict[classifier_name + ".weight"]
        del state_dict[classifier_name + ".bias"]

    ## Resizing the positional embeddings in case they don't match
    # (the helper logs the resize, so nothing is logged when sizes agree)
    if num_patches + 1 != state_dict["pos_embed"].size(1):
        state_dict["pos_embed"] = resize_spatial_embedding(
            state_dict, "pos_embed", num_patches
        )

    ## Resizing time embeddings in case they don't match
    if "time_embed" in state_dict and num_frames != state_dict["time_embed"].size(1):
        state_dict["time_embed"] = resize_temporal_embedding(
            state_dict, "time_embed", num_frames
        )

    ## Initializing temporal attention
    if attention_type == "divided_space_time":
        new_state_dict = state_dict.copy()
        for key in state_dict:
            if "blocks" not in key:
                continue
            for source, target in (("attn", "temporal_attn"), ("norm1", "temporal_norm1")):
                if source in key:
                    new_key = key.replace(source, target)
                    # Weights already present in the checkpoint take precedence.
                    if new_key not in state_dict:
                        new_state_dict[new_key] = state_dict[key]
        state_dict = new_state_dict

    ## Loading the weights
    model.load_state_dict(state_dict, strict=False)
loaded_state_dict[new_key] + + loaded_state_dict = new_state_dict + + loaded_keys = loaded_state_dict.keys() + model_keys = model.state_dict().keys() + + load_not_in_model = [k for k in loaded_keys if k not in model_keys] + model_not_in_load = [k for k in model_keys if k not in loaded_keys] + + toload = dict() + mismatched_shape_keys = [] + for k in model_keys: + if k in loaded_keys: + if model.state_dict()[k].shape != loaded_state_dict[k].shape: + mismatched_shape_keys.append(k) + else: + toload[k] = loaded_state_dict[k] + + logging.info("Keys in loaded but not in model:") + logging.info(f"In total {len(load_not_in_model)}, {sorted(load_not_in_model)}") + logging.info("Keys in model but not in loaded:") + logging.info(f"In total {len(model_not_in_load)}, {sorted(model_not_in_load)}") + logging.info("Keys in model and loaded, but shape mismatched:") + logging.info( + f"In total {len(mismatched_shape_keys)}, {sorted(mismatched_shape_keys)}" + ) + + model.load_state_dict(toload, strict=False) + + +def load_pretrained_kinetics( + model, + pretrained_model, + cfg=None, + ignore_classifier=True, + num_frames=8, + num_patches=196, + **kwargs, +): + if cfg is None: + cfg = getattr(model, "default_cfg") + if cfg is None or "url" not in cfg or not cfg["url"]: + logging.warning("Pretrained model URL is invalid, using random initialization.") + return + + assert ( + len(pretrained_model) > 0 + ), "Path to pre-trained Kinetics weights not provided." + + state_dict = load_state_dict(pretrained_model) + + classifier_name = cfg["classifier"] + if ignore_classifier: + + classifier_weight_key = classifier_name + ".weight" + classifier_bias_key = classifier_name + ".bias" + + state_dict[classifier_weight_key] = model.state_dict()[classifier_weight_key] + state_dict[classifier_bias_key] = model.state_dict()[classifier_bias_key] + + else: + raise NotImplementedError( + "[dxli] Not supporting loading Kinetics-pretrained ckpt with classifier." 
def detach_variable(inputs):
    """Return detached copies of a tuple of tensors, keeping ``requires_grad``.

    Each output shares storage with its input but is cut from the autograd
    graph; its ``requires_grad`` flag is set to the input's value.

    Args:
        inputs: tuple of tensors.

    Returns:
        A tuple of detached tensors, in the same order as ``inputs``.

    Raises:
        RuntimeError: if ``inputs`` is not a tuple.
    """
    if not isinstance(inputs, tuple):
        # Build one message string; passing the type name as a second
        # positional argument would render the error as a tuple repr.
        raise RuntimeError(
            "Only tuple of tensors is supported. Got Unsupported input type: "
            + type(inputs).__name__
        )
    return tuple(inp.detach().requires_grad_(inp.requires_grad) for inp in inputs)
+# Copyright 2020 Ross Wightman +# Modified Model definition + +import logging +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils +import torch.utils.checkpoint +from einops import rearrange +from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper + +from .helpers import load_pretrained, load_pretrained_imagenet, load_pretrained_kinetics +from .vit_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + DropPath, + to_2tuple, + trunc_normal_, +) + + +def _cfg(url="", **kwargs): + return { + "url": url, + "num_classes": 1000, + "input_size": (3, 224, 224), + "pool_size": None, + "crop_pct": 0.9, + "interpolation": "bicubic", + "mean": IMAGENET_DEFAULT_MEAN, + "std": IMAGENET_DEFAULT_STD, + "first_conv": "patch_embed.proj", + "classifier": "head", + **kwargs, + } + + +default_cfgs = { + "vit_base_patch16_224": _cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth", + mean=(0.5, 0.5, 0.5), + std=(0.5, 0.5, 0.5), + ), +} + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + with_qkv=True, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.with_qkv = with_qkv + if 
class Block(nn.Module):
    """Transformer encoder block with optional divided space-time attention.

    ``attention_type`` selects the forward path:

    * ``"space_only"`` / ``"joint_space_time"``: attention followed by an MLP,
      each applied to a normalised input and added back as a residual.
    * ``"divided_space_time"``: temporal attention across the frames of each
      spatial location, then spatial attention across the patches of each
      frame, then the MLP.

    Args:
        dim: embedding dimension of the tokens.
        num_heads: number of attention heads.
        layer_num: index of this block; stored on the instance only.
        mlp_ratio: hidden size of the MLP as a multiple of ``dim``.
        qkv_bias: whether the qkv projection has a bias.
        qk_scale: optional override of the attention scale.
        drop: dropout rate for the attention projection and the MLP.
        attn_drop: dropout rate for the attention weights.
        drop_path: stochastic-depth rate; ``0`` disables it.
        act_layer: activation class used by the MLP.
        norm_layer: normalisation class.
        attention_type: one of ``"divided_space_time"``, ``"space_only"``,
            ``"joint_space_time"``.
        use_grad_checkpointing: wrap the attention and MLP sub-modules with
            ``checkpoint_wrapper``.
    """

    def __init__(
        self,
        dim,
        num_heads,
        layer_num,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.1,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        attention_type="divided_space_time",
        use_grad_checkpointing=False,
    ):
        super().__init__()
        self.attention_type = attention_type
        assert attention_type in [
            "divided_space_time",
            "space_only",
            "joint_space_time",
        ]

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        # Temporal Attention Parameters
        if self.attention_type == "divided_space_time":
            self.temporal_norm1 = norm_layer(dim)
            self.temporal_attn = Attention(
                dim,
                num_heads=num_heads,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop,
            )
            self.temporal_fc = nn.Linear(dim, dim)

        # drop path
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
        )

        # [dxli]
        self.layer_num = layer_num
        self.use_grad_checkpointing = use_grad_checkpointing

        if use_grad_checkpointing:
            # temporal_attn only exists for divided space-time attention;
            # wrapping it unconditionally would raise AttributeError for the
            # other attention types.
            if self.attention_type == "divided_space_time":
                self.temporal_attn = checkpoint_wrapper(self.temporal_attn)
            self.attn = checkpoint_wrapper(self.attn)
            self.mlp = checkpoint_wrapper(self.mlp)

    def forward(self, x, B, T, W):
        """Apply the block.

        Args:
            x: token tensor whose first token along dim 1 is the CLS token. In
                the divided path the remaining tokens are laid out as
                ``(h w t)`` per batch element.
            B: batch size.
            T: number of frames.
            W: number of patches along the width.

        Returns:
            Tensor with the same shape as ``x``.
        """
        num_spatial_tokens = (x.size(1) - 1) // T
        H = num_spatial_tokens // W

        if self.attention_type in ["space_only", "joint_space_time"]:
            x = x + self.drop_path(self.attn(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
            return x
        elif self.attention_type == "divided_space_time":
            # Temporal: attend over the T frames of every spatial location.
            xt = x[:, 1:, :]
            xt = rearrange(xt, "b (h w t) m -> (b h w) t m", b=B, h=H, w=W, t=T)

            temporal_attn_out = self.temporal_attn(self.temporal_norm1(xt))

            res_temporal = self.drop_path(temporal_attn_out)

            res_temporal = rearrange(
                res_temporal, "(b h w) t m -> b (h w t) m", b=B, h=H, w=W, t=T
            )
            res_temporal = self.temporal_fc(res_temporal)
            xt = x[:, 1:, :] + res_temporal

            # Spatial: attend over the patches of every frame; each frame gets
            # its own copy of the CLS token.
            init_cls_token = x[:, 0, :].unsqueeze(1)
            cls_token = init_cls_token.repeat(1, T, 1)
            cls_token = rearrange(cls_token, "b t m -> (b t) m", b=B, t=T).unsqueeze(1)
            xs = xt
            xs = rearrange(xs, "b (h w t) m -> (b t) (h w) m", b=B, h=H, w=W, t=T)
            xs = torch.cat((cls_token, xs), 1)

            spatial_attn_out = self.attn(self.norm1(xs))
            res_spatial = self.drop_path(spatial_attn_out)

            # Taking care of CLS token
            cls_token = res_spatial[:, 0, :]
            cls_token = rearrange(cls_token, "(b t) m -> b t m", b=B, t=T)
            # averaging for every frame
            cls_token = torch.mean(cls_token, 1, True)
            res_spatial = res_spatial[:, 1:, :]
            res_spatial = rearrange(
                res_spatial, "(b t) (h w) m -> b (h w t) m", b=B, h=H, w=W, t=T
            )
            res = res_spatial
            x = xt

            # Residual: (CLS, temporal output) + (averaged CLS, spatial output)
            x = torch.cat((init_cls_token, x), 1) + torch.cat((cls_token, res), 1)

            x_res = x

            x = self.norm2(x)

            # MLP
            mlp_out = self.mlp(x)

            x = x_res + self.drop_path(mlp_out)
            return x
nn.Dropout(p=drop_rate) + + # Attention Blocks + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, self.depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList( + [ + Block( + layer_num=i, + use_grad_checkpointing=( + use_grad_checkpointing and i >= self.depth - ckpt_layer + ), + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + attention_type=self.attention_type, + ) + for i in range(self.depth) + ] + ) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = ( + nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_weights) + + # initialization of temporal attention weights + if self.attention_type == "divided_space_time": + i = 0 + for m in self.blocks.modules(): + m_str = str(m) + if "Block" in m_str: + if i > 0: + nn.init.constant_(m.temporal_fc.weight, 0) + nn.init.constant_(m.temporal_fc.bias, 0) + i += 1 + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed", "cls_token", "time_embed"} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=""): + self.num_classes = num_classes + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + def remove_classifier(self): + self.num_classes = 0 + self.head = None + + def forward_features(self, x): + B = x.shape[0] + x, T, W = self.patch_embed(x) + cls_tokens = self.cls_token.expand(x.size(0), -1, -1) + x = 
torch.cat((cls_tokens, x), dim=1) + + # resizing the positional embeddings in case they don't match the input at inference + if x.size(1) != self.pos_embed.size(1): + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(1, 2) + P = int(other_pos_embed.size(2) ** 0.5) + H = x.size(1) // W + other_pos_embed = other_pos_embed.reshape(1, x.size(2), P, P) + new_pos_embed = F.interpolate(other_pos_embed, size=(H, W), mode="nearest") + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose(1, 2) + new_pos_embed = torch.cat((cls_pos_embed, new_pos_embed), 1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != "space_only": + cls_tokens = x[:B, 0, :].unsqueeze(1) + x = x[:, 1:] + x = rearrange(x, "(b t) n m -> (b n) t m", b=B, t=T) + # Resizing time embeddings in case they don't match + if T != self.time_embed.size(1): + time_embed = self.time_embed.transpose(1, 2) + new_time_embed = F.interpolate(time_embed, size=(T), mode="nearest") + new_time_embed = new_time_embed.transpose(1, 2) + x = x + new_time_embed + else: + x = x + self.time_embed + x = self.time_drop(x) + x = rearrange(x, "(b n) t m -> b (n t) m", b=B, t=T) + x = torch.cat((cls_tokens, x), dim=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == "space_only": + x = rearrange(x, "(b t) n m -> b t n m", b=B, t=T) + x = torch.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _conv_filter(state_dict, patch_size=16): + """convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if "patch_embed.proj.weight" in k: + if 
v.shape[-1] != patch_size: + patch_size = v.shape[-1] + v = v.reshape((v.shape[0], 3, patch_size, patch_size)) + out_dict[k] = v + return out_dict + + +class vit_base_patch16_224(nn.Module): + def __init__(self, cfg, **kwargs): + super(vit_base_patch16_224, self).__init__() + self.pretrained = True + patch_size = 16 + self.model = VisionTransformer( + img_size=cfg.DATA.TRAIN_CROP_SIZE, + num_classes=cfg.MODEL.NUM_CLASSES, + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.1, + num_frames=cfg.DATA.NUM_FRAMES, + attention_type=cfg.TIMESFORMER.ATTENTION_TYPE, + **kwargs, + ) + + self.attention_type = cfg.TIMESFORMER.ATTENTION_TYPE + self.model.default_cfg = default_cfgs["vit_base_patch16_224"] + self.num_patches = (cfg.DATA.TRAIN_CROP_SIZE // patch_size) * ( + cfg.DATA.TRAIN_CROP_SIZE // patch_size + ) + pretrained_model = cfg.TIMESFORMER.PRETRAINED_MODEL + if self.pretrained: + load_pretrained( + self.model, + num_classes=self.model.num_classes, + in_chans=kwargs.get("in_chans", 3), + filter_fn=_conv_filter, + img_size=cfg.DATA.TRAIN_CROP_SIZE, + num_patches=self.num_patches, + attention_type=self.attention_type, + pretrained_model=pretrained_model, + ) + + def forward(self, x): + x = self.model(x) + return x + + +class TimeSformer(nn.Module): + def __init__( + self, + image_size=224, + patch_size=16, + n_frms=8, + attn_drop_rate=0.0, + drop_path_rate=0.1, + drop_rate=0, + use_grad_ckpt=False, + ckpt_layer=0, + remove_classifier=True, + **kwargs, + ): + super(TimeSformer, self).__init__() + + self.img_size = image_size + self.patch_size = patch_size + self.num_frames = n_frms + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.drop_rate = drop_rate + self.use_grad_ckpt = use_grad_ckpt + self.ckpt_layer = ckpt_layer + + self.attention_type = "divided_space_time" + + logging.info( + 
f"Initializing TimeSformer with img_size={self.img_size}, patch_size={self.patch_size}, num_frames={self.num_frames}" + ) + + # will be ignored when loading official pretrained ckpt + self.num_classes = 400 + + self.model = VisionTransformer( + img_size=self.img_size, + num_classes=self.num_classes, + patch_size=self.patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop_rate=self.drop_rate, + attn_drop_rate=self.attn_drop_rate, + drop_path_rate=self.drop_path_rate, + num_frames=self.num_frames, + attention_type=self.attention_type, + use_grad_checkpointing=self.use_grad_ckpt, + ckpt_layer=self.ckpt_layer, + **kwargs, + ) + + if remove_classifier: + self.model.remove_classifier() + + self.model.default_cfg = default_cfgs[ + "vit_base_patch" + str(self.patch_size) + "_224" + ] + self.num_patches = (self.img_size // self.patch_size) * ( + self.img_size // self.patch_size + ) + + def forward(self, x): + x = self.model(x) + return x + + def forward_features(self, x): + # b, c, t, h, w = x.shape + x = self.model.forward_features(x) + + ## apply pooling + W = H = self.img_size // self.patch_size + T = self.num_frames + + cls_tokens = x[:, 0, :].unsqueeze(1) + other_tokens = x[:, 1:, :] + + x = rearrange(other_tokens, "b (h w t) m -> b t (h w) m", h=H, w=W, t=T) + + x = torch.mean(x, dim=1) + x = torch.cat((cls_tokens, x), dim=1) + + return x + + def load_state_dict(self, pretrained_ckpt_path): + logging.info( + "Loading TimeSformer checkpoints from {}".format(pretrained_ckpt_path) + ) + + if pretrained_ckpt_path == "vit_base_patch16_224": + load_ckpt_func = load_pretrained_imagenet + else: + load_ckpt_func = load_pretrained_kinetics + + load_ckpt_func( + self.model, + num_classes=self.model.num_classes, + in_chans=3, + filter_fn=_conv_filter, + img_size=self.img_size, + num_frames=self.num_frames, + num_patches=self.num_patches, + attention_type=self.attention_type, + 
pretrained_model=pretrained_ckpt_path, + ) diff --git a/lavis/models/timesformer/vit_utils.py b/lavis/models/timesformer/vit_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5045d586495ca8ddab3f52d5f0a1b207fe263762 --- /dev/null +++ b/lavis/models/timesformer/vit_utils.py @@ -0,0 +1,189 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Based on https://github.com/facebookresearch/TimeSformer +""" + +# Copyright 2020 Ross Wightman +# Various utility functions + +import torch +import torch.nn as nn +import math +import warnings +import torch.nn.functional as F + +from itertools import repeat +import collections.abc as container_abcs + +DEFAULT_CROP_PCT = 0.875 +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) +IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) +IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255) +IMAGENET_DPN_STD = tuple([1 / (0.0167 * 255)] * 3) + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. 
+ tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +# From PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, container_abcs.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + +# Calculate symmetric padding for a convolution +def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +def get_padding_value(padding, kernel_size, **kwargs): + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == "same": + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + 
# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution
def get_same_padding(x: int, k: int, s: int, d: int):
    """Return the total padding along one dimension for 'SAME' convolution.

    Args:
        x: input size along the dimension.
        k: kernel size.
        s: stride.
        d: dilation.

    Returns:
        Non-negative number of padding elements needed so that the output size
        is ``ceil(x / s)``.
    """
    # True division is required: ``x // s`` floors first, which makes the
    # ceil a no-op and under-pads whenever x is not a multiple of s.
    return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0)
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Args:
        drop_prob: probability of dropping a sample's path. ``None`` (the
            default) disables dropping.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # ``None`` is the constructor default; handing it to drop_path() in
        # training mode fails on ``1 - None``, so treat it as "never drop".
        if self.drop_prob is None:
            return x
        return drop_path(x, self.drop_prob, self.training)
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's dtype (fp16-safe)."""

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        # Normalize in full precision, then cast back to the caller's dtype.
        normalized = super().forward(x.to(torch.float32))
        return normalized.to(input_dtype)
def encode_text(self, text):
    """Embed token ids and project the end-of-text position into the joint space.

    Args:
        text: integer token ids, indexed as [batch_size, n_ctx].

    Returns:
        One row per sequence: the final-layer feature at the position of the
        largest token id, multiplied by ``self.text_projection``.
    """
    embedded = self.token_embedding(text) + self.positional_embedding  # [batch_size, n_ctx, d_model]

    # The transformer works sequence-first: NLD -> LND, then back to NLD.
    hidden = self.transformer(embedded.permute(1, 0, 2)).permute(1, 0, 2)
    hidden = self.ln_final(hidden)

    # take features from the eot embedding (eot_token is the highest number in each sequence)
    rows = torch.arange(hidden.shape[0])
    eot_positions = text.argmax(dim=-1)
    return hidden[rows, eot_positions] @ self.text_projection
def ULIP_PointBERT(ulip_v=2):
    """Build a ULIP model with a PointBERT point encoder and load a pretrained checkpoint.

    Args:
        ulip_v: checkpoint selector; one of ``2``, ``1``, ``'shapenet'``,
            ``'objaverse_k_1'``, ``'objaverse_shapenet_k_1'`` or
            ``'ulip2_scaledup'``.

    Returns:
        A ``ULIP_WITH_IMAGE`` instance with the checkpoint's ``state_dict``
        loaded non-strictly (``strict=False``).

    Raises:
        ValueError: if ``ulip_v`` is not one of the selectors above.
    """
    ## TODO: setup config
    # Checkpoints are absolute local filesystem paths; they must exist on the host.
    # Alternative source kept from the original for reference:
    # url = "https://storage.cloud.google.com/sfr-ulip-code-release-research/pretrained_models/ckpt_zero-sho_classification/checkpoint_pointbert.pt"
    # cached_file = download_cached_file(url, check_hash=False, progress=True)
    checkpoints = {
        2: '/export/share/lxue/shared_models/ULIP-2/objaverse_shapenet_k_5/ULIP-2_pointbert_last.pt',
        1: '/export/share/lxue/shared_models/ULIP-1/objaverse/ULIP-1_pointbert_last.pt',
        'shapenet': '/export/share/lxue/shared_models/ULIP-1/objaverse_shapenet/checkpoint_last.pt',
        'objaverse_k_1': '/export/share/lxue/shared_models/ULIP-2/objaverse_k_1/checkpoint_last.pt',
        'objaverse_shapenet_k_1': '/export/share/lxue/shared_models/ULIP-2/objaverse_shapenet_k_1/checkpoint_last.pt',
        'ulip2_scaledup': "/export/share/lxue/shared_models/ULIP-2/objaverse_shapenet_k_1_scaled_up/checkpoint_last.pt",
    }
    # Resolve the checkpoint before any download or model construction so an
    # unsupported selector fails immediately instead of surfacing later as an
    # UnboundLocalError on ``cached_file``.
    try:
        cached_file = checkpoints[ulip_v]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unsupported ulip_v {ulip_v!r}; expected one of {list(checkpoints)}"
        ) from None

    vision_model = timm.create_model('vit_base_patch16_224', num_classes=0)

    # =====================================================================
    # import the 3D backbone and specify the output point cloud feature dimension
    from lavis.models.ulip_models.pointbert.point_encoder import PointTransformer
    from lavis.models.ulip_models.utils.config import cfg_from_yaml_file
    ## TODO: parse as config
    # config_addr = '/export/home/LAVIS/lavis/models/ulip_models/pointbert/PointTransformer_8192point.yaml'
    url = "https://raw.githubusercontent.com/salesforce/ULIP/48d8d00b1cdb2aee79005817a202816f1c521911/models/pointbert/PointTransformer_8192point.yaml"
    config_addr = download_cached_file(
        url, check_hash=False, progress=True
    )
    config = cfg_from_yaml_file(config_addr)
    pc_feat_dims = 768
    if ulip_v == "ulip2_scaledup":
        config.model.depth = 18
        transformer_layers = 18
        embed_dim = 1280
    else:
        embed_dim = 512
        # NOTE(review): kept inside the else-branch so the scaled-up value
        # above is not overwritten -- confirm against the original nesting.
        transformer_layers = 12
    point_encoder = PointTransformer(config.model)
    # =====================================================================
    model = ULIP_WITH_IMAGE(embed_dim=embed_dim, vision_width=pc_feat_dims, point_encoder=point_encoder, vision_model=vision_model,
                            context_length=77, vocab_size=49408,
                            transformer_width=512, transformer_heads=8, transformer_layers=transformer_layers, pc_feat_dims=pc_feat_dims)

    ckpt = torch.load(cached_file, map_location='cpu')
    # Drop every 'module.' occurrence from the checkpoint's parameter names.
    state_dict = OrderedDict()
    for k, v in ckpt['state_dict'].items():
        state_dict[k.replace('module.', '')] = v
    # model.cuda()
    model.load_state_dict(state_dict, strict=False)
    return model
class ULIPWithImageLoss(nn.Module):
    """Symmetric contrastive loss pairing point-cloud embeddings with text and image embeddings."""

    def __init__(self):
        super().__init__()
        # Target indices, rebuilt whenever the local batch size changes.
        self.labels = None
        self.last_local_batch_size = None

    def forward(self, outputs):
        """Compute the loss and the point-cloud retrieval accuracies.

        Args:
            outputs: mapping with 'pc_embed', 'text_embed', 'image_embed'
                and 'logit_scale'.

        Returns:
            dict with 'loss', 'ulip_loss' (the same value),
            'ulip_pc_image_acc' and 'ulip_pc_text_acc' (percentages).
        """
        pc_embed = outputs['pc_embed']
        text_embed = outputs['text_embed']
        image_embed = outputs['image_embed']
        logit_scale = outputs['logit_scale']
        batch = pc_embed.size(0)

        if batch != self.last_local_batch_size:
            # Offset by rank so labels index into the gathered batch.
            rank_offset = batch * utils.get_rank()
            self.labels = rank_offset + torch.arange(batch, device=pc_embed.device)
            self.last_local_batch_size = batch
        labels = self.labels

        # normalized features
        pc_embed, text_embed, image_embed = (
            F.normalize(embed, dim=-1, p=2)
            for embed in (pc_embed, text_embed, image_embed)
        )

        # gather features from all GPUs
        pc_all, text_all, image_all = utils.all_gather_batch(
            [pc_embed, text_embed, image_embed]
        )

        def scaled_logits(queries, keys):
            # cosine similarity as logits
            return logit_scale * queries @ keys.t()

        logits_pc_text = scaled_logits(pc_embed, text_all)
        logits_text_pc = scaled_logits(text_embed, pc_all)
        logits_pc_image = scaled_logits(pc_embed, image_all)
        logits_image_pc = scaled_logits(image_embed, pc_all)

        text_term = (
            F.cross_entropy(logits_pc_text, labels)
            + F.cross_entropy(logits_text_pc, labels)
        ) / 2
        image_term = (
            F.cross_entropy(logits_pc_image, labels)
            + F.cross_entropy(logits_image_pc, labels)
        ) / 2
        loss = text_term + image_term

        # compute accuracy
        with torch.no_grad():
            def accuracy(logits):
                hits = torch.argmax(logits, dim=-1).eq(labels).sum()
                return 100 * hits / batch

            pc_text_acc = accuracy(logits_pc_text)
            pc_image_acc = accuracy(logits_pc_image)

        return {'loss': loss, 'ulip_loss': loss, 'ulip_pc_image_acc': pc_image_acc, 'ulip_pc_text_acc': pc_text_acc}
def _strip_prefix_if_present(state_dict: Dict[str, Any], prefix: str) -> None:
    """
    Remove `prefix` from every key of `state_dict`, in place.

    Nothing is changed unless every non-empty key starts with `prefix`. When
    the state dict carries a `_metadata` mapping, its non-empty keys are
    shortened the same way.
    Args:
        state_dict (OrderedDict): a state-dict to be loaded to the model.
        prefix (str): prefix.
    """
    ordered_keys = sorted(state_dict.keys())
    if any(len(key) != 0 and not key.startswith(prefix) for key in ordered_keys):
        return

    cut = len(prefix)
    for old_key in ordered_keys:
        state_dict[old_key[cut:]] = state_dict.pop(old_key)

    # also strip the prefix in metadata, if any..
    if not hasattr(state_dict, "_metadata"):
        return
    metadata = state_dict._metadata  # pyre-ignore
    # for the metadata dict, the key can be:
    # '': for the DDP module, which we want to remove.
    # 'module': for the actual model.
    # 'module.xx.xx': for the rest.
    for meta_key in [k for k in metadata.keys() if len(k) != 0]:
        metadata[meta_key[cut:]] = metadata.pop(meta_key)
def _group_to_str(group: List[str]) -> str:
    """
    Format a group of parameter name suffixes into a loggable string.
    Args:
        group (list[str]): list of parameter name suffixes.
    Returns:
        str: formatted string: "" for no suffix, ".name" for one, and
            ".{a, b, ...}" for several.
    """
    if len(group) > 1:
        return ".{" + ", ".join(group) + "}"
    return "." + group[0] if len(group) == 1 else ""
class DGCNN(nn.Module):
    """Graph-convolution feature extractor over grouped point features.

    Each of the four stages builds edge features from the ``k = 4`` nearest
    neighbours (see ``get_graph_feature``), applies a 1x1 convolution and
    max-pools over the neighbours. The stage outputs are concatenated and
    projected to ``output_channel`` channels.
    """

    def __init__(self, encoder_channel, output_channel):
        super().__init__()
        # NOTE: the original inline remark here said "K has to be 16", but
        # get_graph_feature hard-codes k = 4.
        self.input_trans = nn.Conv1d(encoder_channel, 128, 1)

        self.layer1 = nn.Sequential(nn.Conv2d(256, 256, kernel_size=1, bias=False),
                                    nn.GroupNorm(4, 256),
                                    nn.LeakyReLU(negative_slope=0.2)
                                    )

        self.layer2 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=1, bias=False),
                                    nn.GroupNorm(4, 512),
                                    nn.LeakyReLU(negative_slope=0.2)
                                    )

        self.layer3 = nn.Sequential(nn.Conv2d(1024, 512, kernel_size=1, bias=False),
                                    nn.GroupNorm(4, 512),
                                    nn.LeakyReLU(negative_slope=0.2)
                                    )

        self.layer4 = nn.Sequential(nn.Conv2d(1024, 1024, kernel_size=1, bias=False),
                                    nn.GroupNorm(4, 1024),
                                    nn.LeakyReLU(negative_slope=0.2)
                                    )

        self.layer5 = nn.Sequential(nn.Conv1d(2304, output_channel, kernel_size=1, bias=False),
                                    nn.GroupNorm(4, output_channel),
                                    nn.LeakyReLU(negative_slope=0.2)
                                    )

    @staticmethod
    def get_graph_feature(coor_q, x_q, coor_k, x_k):
        """Build edge features ``[x_k[neighbour] - x_q, x_q]`` for each query point.

        Args:
            coor_q, coor_k: query / key coordinates, ``bs, 3, np``.
            x_q, x_k: query / key features, ``bs, c, np``.

        Returns:
            Tensor of shape ``bs, 2 * c, np_q, k`` with ``k = 4``.
        """
        # coor: bs, 3, np, x: bs, c, np

        k = 4
        batch_size = x_k.size(0)
        num_points_k = x_k.size(2)
        num_points_q = x_q.size(2)

        with torch.no_grad():
            if knn is not None:
                _, idx = knn(coor_k, coor_q)  # bs k np
            else:
                # No CUDA KNN available (module-level ``knn`` is None):
                # brute-force the neighbours, keeping the same (bs, k, np)
                # index layout the code below expects.
                pairwise = torch.cdist(coor_q.transpose(1, 2), coor_k.transpose(1, 2))  # bs np_q np_k
                idx = pairwise.topk(k, dim=-1, largest=False)[1].transpose(1, 2).contiguous()
            assert idx.shape[1] == k
            # Offset per-sample indices so they address the flattened (bs * np_k) key table.
            idx_base = torch.arange(0, batch_size, device=x_q.device).view(-1, 1, 1) * num_points_k
            idx = idx + idx_base
            idx = idx.view(-1)
        num_dims = x_k.size(1)
        x_k = x_k.transpose(2, 1).contiguous()
        feature = x_k.view(batch_size * num_points_k, -1)[idx, :]
        feature = feature.view(batch_size, k, num_points_q, num_dims).permute(0, 3, 2, 1).contiguous()
        x_q = x_q.view(batch_size, num_dims, num_points_q, 1).expand(-1, -1, -1, k)
        feature = torch.cat((feature - x_q, x_q), dim=1)
        return feature

    def forward(self, f, coor):
        """Map group features ``f`` (B G C) with centres ``coor`` (B G 3) to B G C'."""
        # f: B G C
        # coor: B G 3

        # bs 3 N   bs C N
        feature_list = []
        coor = coor.transpose(1, 2).contiguous()  # B 3 N
        f = f.transpose(1, 2).contiguous()  # B C N
        f = self.input_trans(f)  # B 128 N

        f = self.get_graph_feature(coor, f, coor, f)  # B 256 N k
        f = self.layer1(f)  # B 256 N k
        f = f.max(dim=-1, keepdim=False)[0]  # B 256 N
        feature_list.append(f)

        f = self.get_graph_feature(coor, f, coor, f)  # B 512 N k
        f = self.layer2(f)  # B 512 N k
        f = f.max(dim=-1, keepdim=False)[0]  # B 512 N
        feature_list.append(f)

        f = self.get_graph_feature(coor, f, coor, f)  # B 1024 N k
        f = self.layer3(f)  # B 512 N k
        f = f.max(dim=-1, keepdim=False)[0]  # B 512 N
        feature_list.append(f)

        f = self.get_graph_feature(coor, f, coor, f)  # B 1024 N k
        f = self.layer4(f)  # B 1024 N k
        f = f.max(dim=-1, keepdim=False)[0]  # B 1024 N
        feature_list.append(f)

        f = torch.cat(feature_list, dim=1)  # B 2304 N

        f = self.layer5(f)  # B C' N

        f = f.transpose(-1, -2)

        return f
class Group(nn.Module):
    """Split a point cloud into ``num_group`` neighbourhoods of ``group_size`` points each."""

    def __init__(self, num_group, group_size):
        super().__init__()
        self.num_group = num_group
        self.group_size = group_size
        # ``KNN`` is only imported when CUDA is available (module-level
        # ``knn`` is None otherwise), so building it unconditionally raised
        # NameError on CPU-only hosts. forward() uses knn_point() instead, so
        # the attribute is kept only for compatibility.
        self.knn = KNN(k=self.group_size, transpose_mode=True) if knn is not None else None

    def forward(self, xyz):
        '''
            input: B N 3
            ---------------------------
            output: B G M 3
            center : B G 3
        '''
        batch_size, num_points, _ = xyz.shape
        # fps the centers out
        center = misc.fps(xyz, self.num_group)  # B G 3
        # knn to get the neighborhood
        # _, idx = self.knn(xyz, center) # B G M
        idx = knn_point(self.group_size, xyz, center)  # B G M
        assert idx.size(1) == self.num_group
        assert idx.size(2) == self.group_size
        # Offset per-sample indices so they address the flattened (B * N) point table.
        idx_base = torch.arange(0, batch_size, device=xyz.device).view(-1, 1, 1) * num_points
        idx = idx + idx_base
        idx = idx.view(-1)
        neighborhood = xyz.view(batch_size * num_points, -1)[idx, :]
        neighborhood = neighborhood.view(batch_size, self.num_group, self.group_size, 3).contiguous()
        # normalize: express each neighbourhood relative to its centre
        neighborhood = neighborhood - center.unsqueeze(2)
        return neighborhood, center
class Decoder(nn.Module):
    """Decode one feature vector per group into a coarse and a fine point set."""

    def __init__(self, encoder_channel, num_fine):
        super().__init__()
        self.num_fine = num_fine
        self.grid_size = 2
        self.num_coarse = self.num_fine // 4
        assert num_fine % 4 == 0

        # Predicts the coarse points directly from the group feature.
        self.mlp = nn.Sequential(
            nn.Linear(encoder_channel, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 3 * self.num_coarse)
        )
        # Predicts per-point offsets from [feature, 2-D grid seed, coarse point].
        self.final_conv = nn.Sequential(
            nn.Conv1d(encoder_channel + 3 + 2, 512, 1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, 1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 3, 1)
        )

        grid = self.grid_size
        axis = torch.linspace(-0.05, 0.05, steps=grid, dtype=torch.float)
        grid_u = axis.view(1, grid).expand(grid, grid).reshape(1, -1)
        grid_v = axis.view(grid, 1).expand(grid, grid).reshape(1, -1)
        self.folding_seed = torch.cat([grid_u, grid_v], dim=0).view(1, 2, grid ** 2)  # 1 2 S

    def forward(self, feature_global):
        '''
            feature_global : B G C
            -------
            coarse : B G M 3
            fine : B G N 3
        '''
        bs, g, c = feature_global.shape
        flat = bs * g
        cells = self.grid_size ** 2
        latent = feature_global.reshape(flat, c)

        coarse = self.mlp(latent).reshape(flat, self.num_coarse, 3)  # BG M 3

        # Every coarse point repeated once per grid cell; used both as a
        # conv input and as the centre the predicted offsets are added to.
        anchors = coarse.unsqueeze(2).expand(-1, -1, cells, -1)  # BG (M) S 3
        anchors = anchors.reshape(flat, self.num_fine, 3).transpose(2, 1)  # BG 3 N

        seed = self.folding_seed.unsqueeze(2).expand(flat, -1, self.num_coarse, -1)  # BG 2 M (S)
        seed = seed.reshape(flat, -1, self.num_fine).to(latent.device)  # BG 2 N

        tiled = latent.unsqueeze(2).expand(-1, -1, self.num_fine)  # BG C N
        feat = torch.cat([tiled, seed, anchors], dim=1)  # BG (C + 2 + 3) N

        fine = self.final_conv(feat) + anchors  # BG 3 N
        fine = fine.reshape(bs, g, 3, self.num_fine).transpose(-1, -2)
        return coarse.reshape(bs, g, self.num_coarse, 3), fine
def get_loss(self, ret, gt):
    """Return ``(reconstruction loss, KL loss)`` for one forward result.

    The KL term compares the token distribution averaged over groups with a
    uniform distribution over ``self.num_tokens`` tokens.
    """
    # reconstruction loss
    recon = self.recon_loss(ret, gt)

    # kl divergence
    token_logits = ret[-1]  # B G N
    avg_probs = F.softmax(token_logits, dim=-1).mean(dim=1)
    log_qy = torch.log(avg_probs)
    uniform = torch.tensor([1. / self.num_tokens], device=gt.device)
    log_uniform = torch.log(uniform).expand(log_qy.size(0), log_qy.size(1))
    kl = F.kl_div(log_qy, log_uniform, reduction='batchmean', log_target=True)

    return recon, kl
def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
    """Initialize and get a logger by name.
    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
    be directly returned. During initialization, a StreamHandler will always be
    added. If `log_file` is specified and the process rank is 0, a FileHandler
    will also be added.
    Args:
        name (str): Logger name.
        log_file (str | None): The log filename. If specified, a FileHandler
            will be added to the logger.
        log_level (int): The logger level. Note that only the process of
            rank 0 is affected, and other processes will set the level to
            "Error" thus be silent most of the time.
        file_mode (str): The file mode used in opening log file.
            Defaults to 'w'.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger
    # handle hierarchical names
    # e.g., logger "a" is initialized, then logger "a.b" will skip the
    # initialization since it is a child of "a".
    # The "." is part of the match: a bare prefix test would also treat "ab"
    # as a child of "a" and hand back a logger that never got its handlers.
    for logger_name in logger_initialized:
        if name.startswith(logger_name + "."):
            return logger

    # handle duplicate logs to the console
    # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
    # to the root logger. As logger.propagate is True by default, this root
    # level handler causes logging messages from rank>0 processes to
    # unexpectedly show up on the console, creating much unwanted clutter.
    # To fix this issue, we set the root logger's StreamHandler, if any, to log
    # at the ERROR level.
    for handler in logger.root.handlers:
        # Exact type match on purpose: subclasses such as FileHandler are left alone.
        if type(handler) is logging.StreamHandler:
            handler.setLevel(logging.ERROR)

    stream_handler = logging.StreamHandler()
    handlers = [stream_handler]

    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    else:
        rank = 0

    # only rank 0 will add a FileHandler
    if rank == 0 and log_file is not None:
        # Here, the default behaviour of the official logger is 'a'. Thus, we
        # provide an interface to change the file mode to the default
        # behaviour.
        file_handler = logging.FileHandler(log_file, file_mode)
        handlers.append(file_handler)

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(log_level)
        logger.addHandler(handler)

    if rank == 0:
        logger.setLevel(log_level)
    else:
        logger.setLevel(logging.ERROR)

    logger_initialized[name] = True

    return logger
def fps(xyz, npoint):
    """Farthest point sampling that returns the sampled points themselves.

    Input:
        xyz: pointcloud data, [B, N, C]
        npoint: number of samples
    Return:
        sampled points, [B, npoint, C] (the selected indices are gathered
        from `xyz` with `index_points` before returning)
    """
    device = xyz.device
    B, N, C = xyz.shape
    centroids = torch.zeros(B, npoint, dtype=torch.long, device=device)
    # Running squared distance from every point to its nearest chosen centroid.
    distance = torch.full((B, N), 1e10, device=device)
    # Drawn on the CPU generator and then moved, so the random stream does not
    # depend on which device `xyz` lives on.
    farthest = torch.randint(0, N, (B,), dtype=torch.long).to(device)
    batch_indices = torch.arange(B, dtype=torch.long, device=device)
    for i in range(npoint):
        centroids[:, i] = farthest
        # Use the actual channel count instead of a hard-coded 3 so inputs
        # with C != 3 do not fail in the reshape.
        centroid = xyz[batch_indices, farthest, :].view(B, 1, C)
        dist = torch.sum((xyz - centroid) ** 2, -1)
        distance = torch.min(distance, dist)
        farthest = torch.max(distance, -1)[1]
    return index_points(xyz, centroids)
def seprate_point_cloud(xyz, num_points, crop, fixed_points = None, padding_zeros = False):
    '''
    Split each cloud in a batch into a kept part and a cropped part.

    For every cloud the `num_crop` points nearest to a centre are cropped.
    The centre is `fixed_points` (one tensor, or a list to sample from) or,
    when that is None, a random unit vector.

    Args:
        xyz: tensor of shape (B, num_points, 3).
        num_points: expected number of points per cloud (asserted).
        crop: number of points to crop, or a two-element list [low, high]
            from which the number is drawn per cloud with random.randint.
        fixed_points: optional centre tensor with 3 elements, or a list of them.
        padding_zeros: if True the cropped points are zeroed in place of being
            removed, so the kept part retains all num_points rows.

    Returns:
        (input_data, crop_data), both contiguous tensors concatenated along
        the batch axis; or (xyz, None) when crop == num_points.
    '''
    _, n, c = xyz.shape

    assert n == num_points
    assert c == 3
    if crop == num_points:
        return xyz, None

    # Keep the crop centre on the same device as the data; forcing .cuda()
    # fails for CPU tensors and for data on a non-default GPU.
    device = xyz.device

    INPUT = []
    CROP = []
    for points in xyz:
        if isinstance(crop, list):
            num_crop = random.randint(crop[0], crop[1])
        else:
            num_crop = crop

        points = points.unsqueeze(0)

        if fixed_points is None:
            center = F.normalize(torch.randn(1, 1, 3), p=2, dim=-1).to(device)
        else:
            if isinstance(fixed_points, list):
                fixed_point = random.sample(fixed_points, 1)[0]
            else:
                fixed_point = fixed_points
            center = fixed_point.reshape(1, 1, 3).to(device)

        # Distance from the centre to every point: shape (1, 1, N).
        distance_matrix = torch.norm(center.unsqueeze(2) - points.unsqueeze(1), p=2, dim=-1)

        # Point indices ordered from nearest to farthest: shape (N,).
        idx = torch.argsort(distance_matrix, dim=-1, descending=False)[0, 0]

        if padding_zeros:
            input_data = points.clone()
            input_data[0, idx[:num_crop]] = input_data[0, idx[:num_crop]] * 0
        else:
            input_data = points.clone()[0, idx[num_crop:]].unsqueeze(0)  # 1 N 3

        crop_data = points.clone()[0, idx[:num_crop]].unsqueeze(0)

        if isinstance(crop, list):
            # A variable crop size gives ragged parts; both are resampled to
            # 2048 points with fps so they can be concatenated.
            INPUT.append(fps(input_data, 2048))
            CROP.append(fps(crop_data, 2048))
        else:
            INPUT.append(input_data)
            CROP.append(crop_data)

    input_data = torch.cat(INPUT, dim=0)  # B N 3
    crop_data = torch.cat(CROP, dim=0)  # B M 3

    return input_data.contiguous(), crop_data.contiguous()
def random_scale(partial, scale_range=(0.8, 1.2)):
    """Multiply `partial` by one random factor drawn uniformly from `scale_range`.

    Args:
        partial: tensor to scale; the factor is broadcast over it.
        scale_range: two-element sequence (low, high) bounding the factor.

    Returns:
        `partial * scale`, where `scale` is a 1-element tensor on
        `partial.device`.
    """
    low, high = scale_range[0], scale_range[1]
    # Sample on the CPU generator, then follow the input's device rather than
    # forcing CUDA, so the function also works on CPU-only machines.
    scale = torch.rand(1).to(partial.device) * (high - low) + low
    return partial * scale
class Attention(nn.Module):
    """Multi-head self-attention over an input whose shape unpacks as (B, N, C)."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        per_head = dim // num_heads
        # A falsy qk_scale (None or 0) selects the default per_head ** -0.5;
        # pass an explicit value to stay compatible with older weights.
        self.scale = qk_scale or per_head ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        # One projection yields q, k and v; move that axis to the front.
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        packed = packed.permute(2, 0, 3, 1, 4)
        # Indexed rather than tuple-unpacked to keep TorchScript happy.
        query, key, value = packed[0], packed[1], packed[2]

        weights = (query @ key.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(weights.softmax(dim=-1))

        mixed = (weights @ value).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(mixed))
class TransformerEncoder(nn.Module):
    """ Transformer Encoder without hierarchical structure

    A stack of `depth` `Block` modules applied in order; the positional
    embedding is re-added to the running features before every block.
    """

    def __init__(self, embed_dim=768, depth=4, num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.):
        super().__init__()

        # Accept a per-block schedule as either a list or a tuple. A tuple used
        # to be handed whole to every Block, which then failed when comparing
        # it with 0. A scalar rate is shared by all blocks.
        if isinstance(drop_path_rate, (list, tuple)):
            per_block_rates = drop_path_rate
        else:
            per_block_rates = [drop_path_rate] * depth

        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=per_block_rates[i]
            )
            for i in range(depth)])

    def forward(self, x, pos):
        for block in self.blocks:
            x = block(x + pos)
        return x
def get_loss_acc(self, pred, gt, smoothing=True):
    """Compute the classification loss and the accuracy in percent.

    Args:
        pred: logits; classes are read from dimension 1.
        gt: class indices; flattened and cast to long before use.
        smoothing: if True use label-smoothed cross entropy with eps = 0.2,
            otherwise plain cross entropy.

    Returns:
        (loss, acc * 100) as tensors.
    """
    gt = gt.contiguous().view(-1).long()

    if smoothing:
        eps = 0.2
        n_class = pred.size(1)

        # Smoothed target: 1 - eps on the true class, eps spread evenly over
        # the remaining n_class - 1 classes.
        one_hot = torch.zeros_like(pred).scatter(1, gt.view(-1, 1), 1)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prb = F.log_softmax(pred, dim=1)

        loss = -(one_hot * log_prb).sum(dim=1).mean()
    else:
        # `loss_ce` only exists after build_loss_func() has run, and the call
        # in __init__ is commented out. Fall back to the functional form of
        # the same default cross entropy instead of raising AttributeError.
        loss_fn = getattr(self, 'loss_ce', None)
        if loss_fn is None:
            loss_fn = F.cross_entropy
        loss = loss_fn(pred, gt)

    pred = pred.argmax(-1)
    acc = (pred == gt).sum() / float(gt.size(0))

    return loss, acc * 100
def forward(self, pts):
    """Encode a point cloud into one feature vector per sample.

    The cloud is grouped, each group is encoded into a token, a class token
    is prepended, and the transformer output is summarized as the class-token
    feature concatenated with the max over the remaining tokens.
    """
    # Group the cloud the same way for every input; the groups come with
    # their centres, which drive the positional embedding.
    neighborhood, center = self.group_divider(pts)

    # One token per group, projected to the transformer width.
    tokens = self.reduce_dim(self.encoder(neighborhood))
    batch = tokens.size(0)

    # Broadcast the learned class token and its position over the batch.
    cls_token = self.cls_token.expand(batch, -1, -1)
    cls_position = self.cls_pos.expand(batch, -1, -1)

    group_positions = self.pos_embed(center)

    sequence = torch.cat((cls_token, tokens), dim=1)
    positions = torch.cat((cls_position, group_positions), dim=1)

    encoded = self.norm(self.blocks(sequence, positions))

    cls_feature = encoded[:, 0]
    pooled_feature = encoded[:, 1:].max(1)[0]
    return torch.cat([cls_feature, pooled_feature], dim=-1)
type: CosLR, + kwargs: { + epochs: 300, + initial_epochs : 10 +}} + +model : { + NAME: PointTransformer, + trans_dim: 384, + depth: 18, + drop_path_rate: 0.1, + cls_dim: 40, + num_heads: 6, + group_size: 32, + num_group: 512, + encoder_dims: 256, +} +npoints: 8192 +total_bs : 32 +step_per_update : 1 +max_epoch : 300 +grad_norm_clip : 10 + +consider_metric: CDL1 \ No newline at end of file diff --git a/lavis/models/ulip_models/utils/__init__.py b/lavis/models/ulip_models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa793b40cabf921c1fa6eef34a920516c5582419 --- /dev/null +++ b/lavis/models/ulip_models/utils/__init__.py @@ -0,0 +1,7 @@ +''' + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Le Xue +''' diff --git a/lavis/models/ulip_models/utils/__pycache__/__init__.cpython-310.pyc b/lavis/models/ulip_models/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e4370c44dbb5004cb81a5a2117855c765b3b8f3 Binary files /dev/null and b/lavis/models/ulip_models/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/lavis/models/ulip_models/utils/__pycache__/io.cpython-310.pyc b/lavis/models/ulip_models/utils/__pycache__/io.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e052685553edcd7e0c45055970d3df5fcacb5ea6 Binary files /dev/null and b/lavis/models/ulip_models/utils/__pycache__/io.cpython-310.pyc differ diff --git a/lavis/models/ulip_models/utils/__pycache__/utils.cpython-310.pyc b/lavis/models/ulip_models/utils/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..202369557c90dc9cd7e47544ca4d7eb8a0b4c33d Binary files /dev/null and b/lavis/models/ulip_models/utils/__pycache__/utils.cpython-310.pyc differ 
def log_args_to_file(args, pre='args', logger=None):
    """Log every attribute in ``args.__dict__`` as ``<pre>.<name> : <value>``.

    Each line is emitted through `print_log` with the given `logger`.
    """
    attributes = args.__dict__
    for attr_name in attributes:
        line = f'{pre}.{attr_name} : {attributes[attr_name]}'
        print_log(line, logger=logger)
def _load_yaml(stream):
    """Parse one YAML document from an open stream."""
    # NOTE(review): FullLoader is not meant for untrusted input; config files
    # are assumed to be trusted here.
    loader = getattr(yaml, 'FullLoader', None)
    if loader is None:
        # Fallback kept for PyYAML builds that do not expose FullLoader.
        return yaml.load(stream)
    # Parse errors propagate to the caller. The earlier bare `except:` hid
    # them and retried yaml.load on a stream that had already been consumed.
    return yaml.load(stream, Loader=loader)


def merge_new_config(config, new_config):
    """Recursively merge `new_config` into `config` and return `config`.

    - dict values are merged into ``config[key]`` (created as an EasyDict
      when missing);
    - a non-dict ``_base_`` value is treated as a path to a YAML file whose
      content is loaded and stored, merged, under ``config['_base_']``;
    - any other value overwrites ``config[key]``.
    """
    for key, val in new_config.items():
        if isinstance(val, dict):
            if key not in config:
                config[key] = EasyDict()
            merge_new_config(config[key], val)
        elif key == '_base_':
            with open(val, 'r') as f:
                base = _load_yaml(f)
            config[key] = EasyDict()
            merge_new_config(config[key], base)
        else:
            config[key] = val
    return config


def cfg_from_yaml_file(cfg_file):
    """Load the YAML file at `cfg_file` into a fresh EasyDict and return it."""
    config = EasyDict()
    with open(cfg_file, 'r') as f:
        new_config = _load_yaml(f)
    merge_new_config(config=config, new_config=new_config)
    return config
def get_root_logger(log_file=None, log_level=logging.INFO, name='main'):
    """Get the logger registered under `name`, initializing it on first use.

    Thin wrapper around `get_logger`: by default a StreamHandler is added,
    and if `log_file` is specified a FileHandler is added as well.

    Args:
        log_file (str, optional): File path of log. Defaults to None.
        log_level (int, optional): The level of logger.
            Defaults to logging.INFO.
        name (str, optional): The name of the logger. Defaults to 'main'.

    Returns:
        :obj:`logging.Logger`: The obtained logger
    """
    # An earlier revision also built a logging.Filter here, but never attached
    # it to the logger, and its callback called `record.find`, which a
    # LogRecord does not have. The dead code is dropped; logging behaviour is
    # unchanged.
    return get_logger(name=name, log_file=log_file, log_level=log_level)
def print_log(msg, logger=None, level=logging.INFO):
    """Print a log message.

    Args:
        msg (str): The message to be logged.
        logger (logging.Logger | str | None): Where the message goes.
            - None: the message is written with `print()`.
            - logging.Logger: the message is logged on that logger.
            - "silent": the message is dropped.
            - other str: the message is logged on `get_logger(logger)`.
        level (int): Logging level. Only used when the message goes to a
            Logger object or to a logger looked up by name.

    Raises:
        TypeError: if `logger` is none of the accepted kinds.
    """
    if logger is None:
        print(msg)
        return
    if isinstance(logger, logging.Logger):
        logger.log(level, msg)
        return
    if logger == 'silent':
        return
    if isinstance(logger, str):
        named_logger = get_logger(logger)
        named_logger.log(level, msg)
        return
    raise TypeError(
        'logger should be either a logging.Logger object, str, '
        f'"silent" or None, but got {type(logger)}')
+ """ + + def __init__(self, name, build_func=None, parent=None, scope=None): + self._name = name + self._module_dict = dict() + self._children = dict() + self._scope = self.infer_scope() if scope is None else scope + + # self.build_func will be set with the following priority: + # 1. build_func + # 2. parent.build_func + # 3. build_from_cfg + if build_func is None: + if parent is not None: + self.build_func = parent.build_func + else: + self.build_func = build_from_cfg + else: + self.build_func = build_func + if parent is not None: + assert isinstance(parent, Registry) + parent._add_children(self) + self.parent = parent + else: + self.parent = None + + def __len__(self): + return len(self._module_dict) + + def __contains__(self, key): + return self.get(key) is not None + + def __repr__(self): + format_str = self.__class__.__name__ + \ + f'(name={self._name}, ' \ + f'items={self._module_dict})' + return format_str + + @staticmethod + def infer_scope(): + """Infer the scope of registry. + The name of the package where registry is defined will be returned. + Example: + # in mmdet/models/backbone/resnet.py + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + The scope of ``ResNet`` will be ``mmdet``. + Returns: + scope (str): The inferred scope name. + """ + # inspect.stack() trace where this function is called, the index-2 + # indicates the frame where `infer_scope()` is called + filename = inspect.getmodule(inspect.stack()[2][0]).__name__ + split_filename = filename.split('.') + return split_filename[0] + + @staticmethod + def split_scope_key(key): + """Split scope and key. + The first scope will be split from key. + Examples: + >>> Registry.split_scope_key('mmdet.ResNet') + 'mmdet', 'ResNet' + >>> Registry.split_scope_key('ResNet') + None, 'ResNet' + Return: + scope (str, None): The first scope. + key (str): The remaining key. 
+ """ + split_index = key.find('.') + if split_index != -1: + return key[:split_index], key[split_index + 1:] + else: + return None, key + + @property + def name(self): + return self._name + + @property + def scope(self): + return self._scope + + @property + def module_dict(self): + return self._module_dict + + @property + def children(self): + return self._children + + def get(self, key): + """Get the registry record. + Args: + key (str): The class name in string format. + Returns: + class: The corresponding class. + """ + scope, real_key = self.split_scope_key(key) + if scope is None or scope == self._scope: + # get from self + if real_key in self._module_dict: + return self._module_dict[real_key] + else: + # get from self._children + if scope in self._children: + return self._children[scope].get(real_key) + else: + # goto root + parent = self.parent + while parent.parent is not None: + parent = parent.parent + return parent.get(key) + + def build(self, *args, **kwargs): + return self.build_func(*args, **kwargs, registry=self) + + def _add_children(self, registry): + """Add children for a registry. + The ``registry`` will be added as children based on its scope. + The parent registry could build objects from children registry. 
+ Example: + >>> models = Registry('models') + >>> mmdet_models = Registry('models', parent=models) + >>> @mmdet_models.register_module() + >>> class ResNet: + >>> pass + >>> resnet = models.build(dict(NAME='mmdet.ResNet')) + """ + + assert isinstance(registry, Registry) + assert registry.scope is not None + assert registry.scope not in self.children, \ + f'scope {registry.scope} exists in {self.name} registry' + self.children[registry.scope] = registry + + def _register_module(self, module_class, module_name=None, force=False): + if not inspect.isclass(module_class): + raise TypeError('module must be a class, ' + f'but got {type(module_class)}') + + if module_name is None: + module_name = module_class.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in self._module_dict: + raise KeyError(f'{name} is already registered ' + f'in {self.name}') + self._module_dict[name] = module_class + + def deprecated_register_module(self, cls=None, force=False): + warnings.warn( + 'The old API of register_module(module, force=False) ' + 'is deprecated and will be removed, please use the new API ' + 'register_module(name=None, force=False, module=None) instead.') + if cls is None: + return partial(self.deprecated_register_module, force=force) + self._register_module(cls, force=force) + return cls + + def register_module(self, name=None, force=False, module=None): + """Register a module. + A record will be added to `self._module_dict`, whose key is the class + name or the specified name, and value is the class itself. + It can be used as a decorator or a normal function. 
+ Example: + >>> backbones = Registry('backbone') + >>> @backbones.register_module() + >>> class ResNet: + >>> pass + >>> backbones = Registry('backbone') + >>> @backbones.register_module(name='mnet') + >>> class MobileNet: + >>> pass + >>> backbones = Registry('backbone') + >>> class ResNet: + >>> pass + >>> backbones.register_module(ResNet) + Args: + name (str | None): The module name to be registered. If not + specified, the class name will be used. + force (bool, optional): Whether to override an existing class with + the same name. Default: False. + module (type): Module class to be registered. + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + # NOTE: This is a walkaround to be compatible with the old api, + # while it may introduce unexpected bugs. + if isinstance(name, type): + return self.deprecated_register_module(name, force=force) + + # raise the error ahead of time + if not (name is None or isinstance(name, str) or misc.is_seq_of(name, str)): + raise TypeError( + 'name must be either of None, an instance of str or a sequence' + f' of str, but got {type(name)}') + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + self._register_module( + module_class=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(cls): + self._register_module( + module_class=cls, module_name=name, force=force) + return cls + + return _register + + +def build_from_cfg(cfg, registry, default_args=None): + """Build a module from config dict. + Args: + cfg (edict): Config dict. It should at least contain the key "NAME". + registry (:obj:`Registry`): The registry to search the type from. + Returns: + object: The constructed object. 
@lru_cache()
def default_bpe():
    """Return the path of the BPE vocabulary archive located next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, "bpe_simple_vocab_16e6.txt.gz")
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    Each pair is ``(symbol, next_symbol)`` for adjacent positions. A word with
    fewer than two symbols, including an empty one, yields an empty set.
    """
    return set(zip(word, word[1:]))
'<|endoftext|>'} + self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + ( token[-1] + '',) + pairs = get_pairs(word) + + if not pairs: + return token+'' + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') + return text + + def __call__(self, texts, context_length=77): + if isinstance(texts, str): + texts = [texts] + + sot_token = self.encoder["<|startoftext|>"] + eot_token = self.encoder["<|endoftext|>"] + all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + tokens = tokens[:context_length] + result[i, :len(tokens)] = 
def merge_new_config(config, new_config):
    """Recursively merge ``new_config`` into ``config`` in place.

    * dict values are merged key by key into ``config[key]`` (created as an
      ``EasyDict`` when missing);
    * a non-dict value under the key ``'_base_'`` is treated as a path to a
      YAML file, which is loaded and merged into a fresh ``config['_base_']``;
    * any other value overwrites ``config[key]``.

    Args:
        config: Mapping that receives the merged values (mutated).
        new_config: Mapping whose entries are merged in.

    Returns:
        The same ``config`` object.
    """
    for key, val in new_config.items():
        if isinstance(val, dict):
            if key not in config:
                config[key] = EasyDict()
            merge_new_config(config[key], val)
            continue

        if key != '_base_':
            config[key] = val
            continue

        with open(val, 'r') as f:
            # ``FullLoader`` only exists from PyYAML 5.1 on; pick the loader up
            # front so real parse errors propagate, instead of being swallowed
            # and retried on an already-consumed stream.
            loader = getattr(yaml, 'FullLoader', yaml.Loader)
            base_cfg = yaml.load(f, Loader=loader)
        config[key] = EasyDict()
        merge_new_config(config[key], base_cfg)
    return config
def init_distributed_mode(args):
    """Initialise ``torch.distributed`` from environment variables.

    Reads rank information from the environment, stores it on ``args`` and
    starts an NCCL process group. When neither ``RANK``/``WORLD_SIZE`` nor
    ``SLURM_PROCID`` is set, it only sets ``args.distributed = False`` and
    returns.

    Args:
        args: Mutable namespace. ``args.dist_url`` is read; ``rank``, ``gpu``,
            ``distributed`` and ``dist_backend`` are written (plus
            ``world_size`` in the ``RANK``/``WORLD_SIZE`` branch).
    """
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        # Launcher-provided variables; LOCAL_RANK selects the GPU.
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        # NOTE(review): this branch does not set ``args.world_size``, which is
        # read below -- presumably it must already be present on ``args``;
        # verify against the caller.
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    # Wait for every process to finish initialisation before continuing.
    torch.distributed.barrier()
    # Silence ``print`` on every rank except rank 0.
    setup_for_distributed(args.rank == 0)
def all_gather_batch(tensors):
    """
    Gather every tensor in ``tensors`` from all processes.

    With a world size of 1 the input is returned untouched. Otherwise each
    tensor is all-gathered and the per-process pieces are concatenated along
    dim 0; the result is a new list in the same order as ``tensors``.
    """
    n_procs = get_world_size()
    # There is no need for reduction in the single-proc case
    if n_procs == 1:
        return tensors

    gathered = []
    for local in tensors:
        buffers = [torch.ones_like(local) for _ in range(n_procs)]
        dist.all_gather(
            buffers,
            local,
            async_op=False  # performance opt
        )
        gathered.append(buffers)

    return [torch.cat(buffers, dim=0) for buffers in gathered]
def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0):
    """Build a per-iteration schedule: optional linear warmup, then cosine decay.

    Args:
        base_value: Value at the end of warmup / start of the cosine phase.
        final_value: Value the cosine phase decays towards.
        epochs: Total number of epochs.
        niter_per_ep: Iterations per epoch.
        warmup_epochs: Epochs of linear warmup (0 disables warmup).
        start_warmup_value: First value of the warmup ramp.

    Returns:
        numpy.ndarray of length ``epochs * niter_per_ep``.
    """
    total_iters = epochs * niter_per_ep
    warmup_iters = warmup_epochs * niter_per_ep

    if warmup_epochs > 0:
        warmup_part = np.linspace(start_warmup_value, base_value, warmup_iters)
    else:
        warmup_part = np.array([])

    steps = np.arange(total_iters - warmup_iters)
    half_range = 0.5 * (base_value - final_value)
    cosine_term = 1 + np.cos(np.pi * steps / len(steps))
    decay_part = final_value + half_range * cosine_term

    full = np.concatenate((warmup_part, decay_part))
    assert len(full) == epochs * niter_per_ep
    return full
class Mlp(nn.Module):
    """Two-layer feed-forward block: fc1 -> activation -> dropout -> fc2 -> dropout."""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        # A falsy width (None or 0) falls back to the input width.
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        # The same dropout module is applied after both linear layers.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP sub-layers, each with a residual connection."""

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        use_grad_checkpointing=False,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

        if use_grad_checkpointing:
            # Wrap both sub-layers with fairscale's checkpoint wrapper.
            self.attn = checkpoint_wrapper(self.attn)
            self.mlp = checkpoint_wrapper(self.mlp)

    def forward(self, x, register_hook=False):
        attn_out = self.attn(self.norm1(x), register_hook=register_hook)
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_out)
class VisionTransformer(nn.Module):
    """Vision Transformer
    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`  -
        https://arxiv.org/abs/2010.11929

    This variant builds only the encoder (patch embedding, class token,
    position embedding, transformer blocks, final norm); ``forward`` returns
    the normalised token sequence.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        representation_size=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=None,
        use_grad_checkpointing=False,
        ckpt_layer=0,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
                (not referenced in this constructor body)
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
                (not referenced in this constructor body)
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            norm_layer: (nn.Module): normalization layer; defaults to
                ``nn.LayerNorm`` with ``eps=1e-6`` when falsy
            use_grad_checkpointing (bool): enable checkpointing on blocks
                selected by ``ckpt_layer``
            ckpt_layer (int): checkpointing is applied to blocks with index
                ``>= depth - ckpt_layer``
        """
        super().__init__()
        self.num_features = (
            self.embed_dim
        ) = embed_dim  # num_features for consistency with other models
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        num_patches = self.patch_embed.num_patches

        # One extra position for the class token prepended in ``forward``.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, depth)
        ]  # stochastic depth decay rule
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    use_grad_checkpointing=(
                        use_grad_checkpointing and i >= depth - ckpt_layer
                    ),
                )
                for i in range(depth)
            ]
        )
        self.norm = norm_layer(embed_dim)

        trunc_normal_(self.pos_embed, std=0.02)
        trunc_normal_(self.cls_token, std=0.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule: truncated-normal Linear weights, zero biases, unit LayerNorm weights."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the names of parameters to exclude from weight decay."""
        return {"pos_embed", "cls_token"}

    def forward(self, x, register_blk=-1):
        """Encode a batch of images into a token sequence.

        Args:
            x: Input batch; its first dimension is used as the batch size.
            register_blk (int): Index of the block that is called with
                ``register_hook=True``; the default -1 matches no block.

        Returns:
            Output of the final norm layer applied to the token sequence
            (class token first).
        """
        B = x.shape[0]
        x = self.patch_embed(x)

        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)

        # Slice the position table to the actual sequence length.
        x = x + self.pos_embed[:, : x.size(1), :]
        x = self.pos_drop(x)

        for i, blk in enumerate(self.blocks):
            x = blk(x, register_blk == i)
        x = self.norm(x)

        return x

    @torch.jit.ignore()
    def load_pretrained(self, checkpoint_path, prefix=""):
        """Load weights from ``checkpoint_path`` via the module-level ``_load_weights`` helper."""
        _load_weights(self, checkpoint_path, prefix)
"opt/target/embedding/kernel" in w: + prefix = "opt/target/" + + if hasattr(model.patch_embed, "backbone"): + # hybrid + backbone = model.patch_embed.backbone + stem_only = not hasattr(backbone, "stem") + stem = backbone if stem_only else backbone.stem + stem.conv.weight.copy_( + adapt_input_conv( + stem.conv.weight.shape[1], _n2p(w[f"{prefix}conv_root/kernel"]) + ) + ) + stem.norm.weight.copy_(_n2p(w[f"{prefix}gn_root/scale"])) + stem.norm.bias.copy_(_n2p(w[f"{prefix}gn_root/bias"])) + if not stem_only: + for i, stage in enumerate(backbone.stages): + for j, block in enumerate(stage.blocks): + bp = f"{prefix}block{i + 1}/unit{j + 1}/" + for r in range(3): + getattr(block, f"conv{r + 1}").weight.copy_( + _n2p(w[f"{bp}conv{r + 1}/kernel"]) + ) + getattr(block, f"norm{r + 1}").weight.copy_( + _n2p(w[f"{bp}gn{r + 1}/scale"]) + ) + getattr(block, f"norm{r + 1}").bias.copy_( + _n2p(w[f"{bp}gn{r + 1}/bias"]) + ) + if block.downsample is not None: + block.downsample.conv.weight.copy_( + _n2p(w[f"{bp}conv_proj/kernel"]) + ) + block.downsample.norm.weight.copy_( + _n2p(w[f"{bp}gn_proj/scale"]) + ) + block.downsample.norm.bias.copy_(_n2p(w[f"{bp}gn_proj/bias"])) + embed_conv_w = _n2p(w[f"{prefix}embedding/kernel"]) + else: + embed_conv_w = adapt_input_conv( + model.patch_embed.proj.weight.shape[1], _n2p(w[f"{prefix}embedding/kernel"]) + ) + model.patch_embed.proj.weight.copy_(embed_conv_w) + model.patch_embed.proj.bias.copy_(_n2p(w[f"{prefix}embedding/bias"])) + model.cls_token.copy_(_n2p(w[f"{prefix}cls"], t=False)) + pos_embed_w = _n2p(w[f"{prefix}Transformer/posembed_input/pos_embedding"], t=False) + if pos_embed_w.shape != model.pos_embed.shape: + pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights + pos_embed_w, + model.pos_embed, + getattr(model, "num_tokens", 1), + model.patch_embed.grid_size, + ) + model.pos_embed.copy_(pos_embed_w) + model.norm.weight.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/scale"])) + 
def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()):
    """Resize a position-embedding table to match a differently sized target.

    The first ``num_tokens`` entries (e.g. the class token) are kept as-is;
    the remaining entries are treated as a square grid and bicubically
    interpolated to ``gs_new``.

    Args:
        posemb (Tensor): Source embeddings, indexed as ``(1, tokens, dim)``.
        posemb_new (Tensor): Target embeddings; only its shape is used.
        num_tokens (int): Number of leading non-grid tokens.
        gs_new (Sequence[int]): Target grid ``(height, width)``. When empty,
            a square grid is derived from the target token count.

    Returns:
        Tensor: Resized embeddings of shape
        ``(1, num_tokens + gs_new[0] * gs_new[1], dim)``.
    """
    # Rescale the grid of position embeddings when loading from state_dict. Adapted from
    # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
    print(f"Resized position embedding: {posemb.shape} to {posemb_new.shape}")
    ntok_new = posemb_new.shape[1]
    if num_tokens:
        posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:]
        ntok_new -= num_tokens
    else:
        posemb_tok, posemb_grid = posemb[:, :0], posemb[0]
    gs_old = int(math.sqrt(len(posemb_grid)))
    if not len(gs_new):  # backwards compatibility
        gs_new = [int(math.sqrt(ntok_new))] * 2
    assert len(gs_new) >= 2
    print(f"Position embedding grid-size from {[gs_old, gs_old]} to {gs_new}")
    # (tokens, dim) -> (1, dim, gs_old, gs_old) so F.interpolate sees a 2-D grid.
    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    posemb_grid = F.interpolate(
        posemb_grid, size=gs_new, mode="bicubic", align_corners=False
    )
    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1)
    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
    # Hand the result back: the caller copies it into ``model.pos_embed``.
    return posemb
class VisionTransformerEncoder(VisionTransformer, BaseEncoder):
    """``VisionTransformer`` that can be built from a LAVIS-style config mapping."""

    @classmethod
    def from_config(cls, cfg, from_pretrained=False):
        """Build a base or large ViT encoder from ``cfg``.

        Config keys read (with defaults): ``vit_type`` ("base"),
        ``image_size`` (384), ``vit_ckpt_layer`` (0),
        ``vit_drop_path_rate`` (0), ``vit_layer_norm_epsilon`` (-1, meaning
        "use the VisionTransformer default"), ``vit_grad_ckpt`` (False).

        Args:
            cfg: Mapping-like config supporting ``.get``.
            from_pretrained (bool): When True, load pretrained weights (a
                DeiT checkpoint for "base", timm's
                ``vit_large_patch16_224_in21k`` for "large").

        Returns:
            The constructed encoder, with ``vision_width`` set to its
            embedding dimension.
        """
        vit_type = cfg.get("vit_type", "base")
        image_size = cfg.get("image_size", 384)
        ckpt_layer = cfg.get("vit_ckpt_layer", 0)
        drop_path_rate = cfg.get("vit_drop_path_rate", 0)
        norm_layer_eps = cfg.get("vit_layer_norm_epsilon", -1)
        use_grad_checkpointing = cfg.get("vit_grad_ckpt", False)

        if norm_layer_eps == -1:
            norm_layer = None
        else:
            norm_layer = partial(nn.LayerNorm, eps=norm_layer_eps)

        #     norm_layer=partial(nn.LayerNorm, eps=1e-6),
        assert vit_type in ["base", "large"], "vit parameter must be base or large"
        if vit_type == "base":
            vision_width = 768
            visual_encoder = cls(
                img_size=image_size,
                patch_size=16,
                embed_dim=vision_width,
                depth=12,
                num_heads=12,
                use_grad_checkpointing=use_grad_checkpointing,
                ckpt_layer=ckpt_layer,
                drop_path_rate=drop_path_rate,
                norm_layer=norm_layer,
            )

            if from_pretrained:
                checkpoint = torch.hub.load_state_dict_from_url(
                    url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth",
                    map_location="cpu",
                    check_hash=True,
                )
                state_dict = checkpoint["model"]
                # The checkpoint's position table may not match ``image_size``.
                state_dict["pos_embed"] = interpolate_pos_embed(
                    state_dict["pos_embed"], visual_encoder
                )
                # strict=False: non-matching keys are tolerated.
                visual_encoder.load_state_dict(state_dict, strict=False)

        elif vit_type == "large":
            vision_width = 1024
            visual_encoder = cls(
                img_size=image_size,
                patch_size=16,
                embed_dim=vision_width,
                depth=24,
                num_heads=16,
                use_grad_checkpointing=use_grad_checkpointing,
                ckpt_layer=ckpt_layer,
                # Honour the configured rate and fall back to 0.1 when it is
                # unset/zero. The previous ``0.1 or drop_path_rate`` always
                # evaluated to 0.1 and ignored the config.
                drop_path_rate=drop_path_rate or 0.1,
                norm_layer=norm_layer,
            )
            if from_pretrained:
                from timm.models.helpers import load_custom_pretrained
                from timm.models.vision_transformer import default_cfgs

                load_custom_pretrained(
                    visual_encoder, default_cfgs["vit_large_patch16_224_in21k"]
                )

        visual_encoder.vision_width = vision_width
        return visual_encoder

    def forward_features(self, x, register_blk=-1):
        """Return the encoder output; delegates to the parent class ``forward``."""
        return super().forward(x, register_blk)
def load_processor(name, cfg=None):
    """
    Look up the processor class registered under ``name`` and build it
    through its ``from_config`` constructor.

    Example

    >>> processor = load_processor("alpro_video_train", cfg=None)
    """
    processor_cls = registry.get_processor_class(name)
    return processor_cls.from_config(cfg)
index 0000000000000000000000000000000000000000..2225aa702aea343da00d487439e6e640748c1113 Binary files /dev/null and b/lavis/processors/__pycache__/blip_diffusion_processors.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/blip_processors.cpython-310.pyc b/lavis/processors/__pycache__/blip_processors.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d89fb7d4bab0eefe970139b365ccb277b361a375 Binary files /dev/null and b/lavis/processors/__pycache__/blip_processors.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/clip_processors.cpython-310.pyc b/lavis/processors/__pycache__/clip_processors.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..729663e59ec8276351981167cc8cfa5338db3b2f Binary files /dev/null and b/lavis/processors/__pycache__/clip_processors.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/functional_video.cpython-310.pyc b/lavis/processors/__pycache__/functional_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..036f513e3880892cf5db6a3c5c210525e4f56840 Binary files /dev/null and b/lavis/processors/__pycache__/functional_video.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/gpt_processors.cpython-310.pyc b/lavis/processors/__pycache__/gpt_processors.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41b97e380d0200ae5dcc38da50dac56026ce779b Binary files /dev/null and b/lavis/processors/__pycache__/gpt_processors.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/instruction_text_processors.cpython-310.pyc b/lavis/processors/__pycache__/instruction_text_processors.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efdb4493eeb20efc79627e533303f1cfe4db72ab Binary files /dev/null and b/lavis/processors/__pycache__/instruction_text_processors.cpython-310.pyc differ diff --git 
a/lavis/processors/__pycache__/randaugment.cpython-310.pyc b/lavis/processors/__pycache__/randaugment.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dd2f54379f110879e8b66a0c1776960f25a0e2d Binary files /dev/null and b/lavis/processors/__pycache__/randaugment.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/transforms_video.cpython-310.pyc b/lavis/processors/__pycache__/transforms_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a65a6317f0f9c5842571281d6a0549b74e9bffc Binary files /dev/null and b/lavis/processors/__pycache__/transforms_video.cpython-310.pyc differ diff --git a/lavis/processors/__pycache__/ulip_processors.cpython-310.pyc b/lavis/processors/__pycache__/ulip_processors.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b37c00c424d3d6c15c19a8766be1bd5c83f9332 Binary files /dev/null and b/lavis/processors/__pycache__/ulip_processors.cpython-310.pyc differ diff --git a/lavis/processors/alpro_processors.py b/lavis/processors/alpro_processors.py new file mode 100644 index 0000000000000000000000000000000000000000..6baf29ea611e11264a4448fea0fcc4c26bbbd656 --- /dev/null +++ b/lavis/processors/alpro_processors.py @@ -0,0 +1,272 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class AlproVideoBaseProcessor(BaseProcessor):
    """Shared state for the ALPRO video processors: normalisation and frame count."""

    def __init__(self, mean=None, std=None, n_frms=MAX_INT):
        super().__init__()

        if mean is None:
            mean = (0.48145466, 0.4578275, 0.40821073)
        if std is None:
            std = (0.26862954, 0.26130258, 0.27577711)

        self.normalize = transforms_video.NormalizeVideo(mean, std)

        self.n_frms = n_frms

    def _pad_to_n_frms(self, clip):
        """
        Repeat the last frame so that ``clip`` has ``self.n_frms`` frames.

        Args:
            clip (torch.tensor): transformed clip, frames on dim 1.
        Returns:
            torch.tensor: ``clip`` itself when no padding is needed, otherwise
            a new tensor padded along dim 1.
        """
        # MAX_INT is the default "no explicit frame count" value. Padding up
        # to it would try to materialise MAX_INT - T copies of the last frame.
        if self.n_frms == MAX_INT:
            return clip

        pad_size = self.n_frms - clip.shape[1]
        if pad_size <= 0:
            return clip

        last_frame = clip[:, -1:, :, :]
        return torch.cat([clip, last_frame.repeat(1, pad_size, 1, 1)], dim=1)


class ToUint8(object):
    """Transform that casts a tensor to ``torch.uint8``."""

    def __init__(self):
        pass

    def __call__(self, tensor):
        return tensor.to(torch.uint8)

    def __repr__(self):
        return self.__class__.__name__


class ToTHWC(object):
    """
    Move dim 0 of a 4D tensor to the end (dtype is left unchanged).

    Args:
        clip (torch.tensor): Size is (C, T, H, W)
    Return:
        clip (torch.tensor): Size is (T, H, W, C)
    """

    def __init__(self):
        pass

    def __call__(self, tensor):
        return tensor.permute(1, 2, 3, 0)

    def __repr__(self):
        return self.__class__.__name__


class ResizeVideo(object):
    """Transform that resizes a clip to ``target_size`` via ``functional_video.resize``."""

    def __init__(self, target_size, interpolation_mode="bilinear"):
        self.target_size = target_size
        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be resized. Size is (C, T, H, W)
        Returns:
            torch.tensor: resized video clip.
                Size is (C, T, target_size[0], target_size[1])
        """
        return F.resize(clip, self.target_size, self.interpolation_mode)

    def __repr__(self):
        return self.__class__.__name__ + "(resize_size={0})".format(self.target_size)


@registry.register_processor("alpro_video_train")
class AlproVideoTrainProcessor(AlproVideoBaseProcessor):
    """Training-time video processor: load, augment, normalise and pad a clip."""

    def __init__(
        self,
        image_size=384,
        mean=None,
        std=None,
        min_scale=0.5,
        max_scale=1.0,
        n_frms=MAX_INT,
        full_video=True,
    ):
        super().__init__(mean=mean, std=std, n_frms=n_frms)

        self.image_size = image_size
        self.full_video = full_video

        self.transform = transforms.Compose(
            [
                # Video size is (C, T, H, W)
                transforms_video.RandomResizedCropVideo(
                    image_size,
                    scale=(min_scale, max_scale),
                    interpolation_mode="bicubic",
                ),
                transforms_video.RandomHorizontalFlipVideo(),
                ToTHWC(),  # C, T, H, W -> T, H, W, C
                VideoRandomAugment(
                    2,
                    5,
                    augs=[
                        "Identity",
                        # "AutoContrast",
                        "Brightness",
                        "Sharpness",
                        "Equalize",
                        "ShearX",
                        "ShearY",
                        "TranslateX",
                        "TranslateY",
                        "Rotate",
                    ],
                ),
                ToUint8(),
                transforms_video.ToTensorVideo(),  # T, H, W, C -> C, T, H, W
                self.normalize,
            ]
        )

    def __call__(self, vpath, start_sec=None, end_sec=None):
        """
        Args:
            vpath: path of the video to load.
            start_sec, end_sec: clip boundaries, forwarded to ``load_clip``;
                only used when ``full_video`` is False.
        Returns:
            torch.tensor: video clip after transforms. Size is (C, T, size, size).
        """
        if self.full_video:
            clip = load_video(
                video_path=vpath,
                n_frms=self.n_frms,
                height=self.image_size,
                width=self.image_size,
                sampling="headtail",
            )
        else:
            clip = load_clip(
                video_path=vpath,
                num_frames=self.n_frms,
                target_height=self.image_size,
                target_width=self.image_size,
                start_time=start_sec,
                end_time=end_sec,
                sampling="headtail",
            )
        transformed = self.transform(clip)

        # repeat last frame for padding
        return self._pad_to_n_frms(transformed)

    @classmethod
    def from_config(cls, cfg=None):
        """Build the processor from ``cfg``; missing keys use the defaults below."""
        if cfg is None:
            cfg = OmegaConf.create()

        # NOTE(review): this default (256) differs from __init__'s (384).
        image_size = cfg.get("image_size", 256)

        mean = cfg.get("mean", None)
        std = cfg.get("std", None)

        min_scale = cfg.get("min_scale", 0.5)
        max_scale = cfg.get("max_scale", 1.0)

        n_frms = cfg.get("n_frms", MAX_INT)
        full_video = cfg.get("full_video", True)

        return cls(
            image_size=image_size,
            mean=mean,
            std=std,
            min_scale=min_scale,
            max_scale=max_scale,
            n_frms=n_frms,
            full_video=full_video,
        )


@registry.register_processor("alpro_video_eval")
class AlproVideoEvalProcessor(AlproVideoBaseProcessor):
    """Evaluation-time video processor: load, normalise and pad a clip."""

    def __init__(self, image_size=256, mean=None, std=None, n_frms=MAX_INT, full_video=True):
        super().__init__(mean=mean, std=std, n_frms=n_frms)

        self.image_size = image_size
        self.full_video = full_video

        # Input video size is (C, T, H, W)
        self.transform = transforms.Compose(
            [
                # frames will be resized during decord loading.
                ToUint8(),  # C, T, H, W
                ToTHWC(),  # T, H, W, C
                transforms_video.ToTensorVideo(),  # C, T, H, W
                self.normalize,  # C, T, H, W
            ]
        )

    def __call__(self, vpath, start_sec=None, end_sec=None):
        """
        Args:
            vpath: path of the video to load.
            start_sec, end_sec: clip boundaries, forwarded to ``load_clip``.
        Returns:
            torch.tensor: video clip after transforms. Size is (C, T, size, size).
        """
        # NOTE(review): the previous code branched on self.full_video but both
        # branches made this exact call, so the flag has no effect here.
        # Confirm whether the full-video path was meant to use load_video as
        # the train processor does.
        clip = load_clip(
            video_path=vpath,
            num_frames=self.n_frms,
            target_height=self.image_size,
            target_width=self.image_size,
            start_time=start_sec,
            end_time=end_sec,
            sampling="headtail",
        )
        transformed = self.transform(clip)

        # repeat last frame for padding
        return self._pad_to_n_frms(transformed)

    @classmethod
    def from_config(cls, cfg=None):
        """Build the processor from ``cfg``; missing keys use the defaults below."""
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", 256)
        full_video = cfg.get("full_video", True)

        mean = cfg.get("mean", None)
        std = cfg.get("std", None)

        n_frms = cfg.get("n_frms", MAX_INT)

        return cls(image_size=image_size, mean=mean, std=std, n_frms=n_frms, full_video=full_video)
@registry.register_processor("beats_audio")
class BeatsAudioProcessor(BaseProcessor):
    """Turn an audio (or .mp4) file into a stack of normalised fbank frames."""

    def __init__(self, model_name, sampling_rate, n_frames, frame_length, is_eval):
        """
        Adapted from https://github.com/NINAnor/rare_species_detections/blob/main/BEATs/BEATs.py
        """
        super().__init__()

        self.model_name = model_name
        self.sampling_rate = sampling_rate
        self.n_frames = n_frames
        self.frame_length = frame_length
        self.fbank_mean = 15.41663
        self.fbank_std = 6.55582
        self.is_eval = is_eval

    def _empty_audio_tensor(self):
        """Return the all-zero placeholder used when audio cannot be processed."""
        return torch.zeros((self.n_frames, self.frame_length, 128))

    def _load_audio(self, aupath, start_sec=None, end_sec=None):
        """
        Load ``aupath`` as a waveform at ``self.sampling_rate``.

        Args:
            aupath (str): path to an audio file, or to an ``.mp4`` video.
            start_sec, end_sec: optional clip boundaries; applied only to
                ``.mp4`` input and only when both are given.
        Returns:
            torch.tensor: the loaded waveform (stereo input is averaged).
        """
        if aupath.endswith('.mp4'):
            video = VideoFileClip(aupath)
            try:
                clip = video
                if start_sec is not None and end_sec is not None:
                    clip = video.subclip(start_sec, end_sec)
                audio_np = clip.audio.to_soundarray(fps=self.sampling_rate)
            finally:
                # release the file handles / reader processes held by moviepy
                video.close()
            if audio_np.ndim == 2:
                audio_np = audio_np.mean(axis=1)  # Convert to mono
            # already decoded at self.sampling_rate, no resampling needed
            return torch.tensor(audio_np).float()

        waveform, sr = torchaudio.load(aupath)
        # Convert stereo to mono. The ndim guard keeps a 1-D waveform with
        # exactly two samples from being collapsed to a scalar.
        if waveform.ndim == 2 and waveform.shape[0] == 2:
            waveform = torch.mean(waveform, dim=0)
        if sr != self.sampling_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
            waveform = resampler(waveform)
        return waveform

    def _split_into_frames(self, fbank):
        """
        Cut ``fbank`` (rows on dim 0) into chunks of ``frame_length`` rows.

        Training mode returns exactly ``n_frames`` chunks (zero-padded or
        truncated). Eval mode zero-pads up to the next multiple of
        ``frame_length`` and returns every chunk.
        """
        frame_length = self.frame_length
        num_rows = fbank.shape[0]

        if self.is_eval:
            # Distance to the next multiple of frame_length. Padding by
            # `num_rows % frame_length` dropped the tail of the clip and left
            # clips shorter than frame_length / 2 with zero frames.
            pad_len = (-num_rows) % frame_length
            num_frames = (num_rows + pad_len) // frame_length
        else:
            num_frames = self.n_frames
            pad_len = frame_length * num_frames - num_rows

        if pad_len > 0:
            fbank = torch.nn.ZeroPad2d((0, 0, 0, pad_len))(fbank)
        fbank = fbank[: frame_length * num_frames]

        return fbank.reshape(num_frames, frame_length, fbank.shape[-1])

    def __call__(self, aupath, start_sec=None, end_sec=None):
        """
        Args:
            aupath: path to audio file
            start_sec, end_sec: optional clip boundaries (used for .mp4 input)
        Returns:
            torch.tensor: audio clip after transforms, or an all-zero tensor of
            size (n_frames, frame_length, 128) when the audio cannot be loaded
            or featurised.
        """
        # Loading is best-effort: an unreadable file yields the placeholder
        # instead of aborting the caller.
        try:
            waveform = self._load_audio(aupath, start_sec, end_sec)
        except Exception:
            return self._empty_audio_tensor()

        # Validate waveform
        if waveform.ndim == 0:
            return self._empty_audio_tensor()

        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)

        waveform = waveform * 2**15

        # Compute fbank features
        try:
            fbank = ta_kaldi.fbank(
                waveform,
                num_mel_bins=128,
                sample_frequency=self.sampling_rate,
                frame_length=25,
                frame_shift=10,
            )
            fbank = (fbank - self.fbank_mean) / (2 * self.fbank_std)
        except Exception:
            return self._empty_audio_tensor()

        # No feature rows at all: nothing to frame.
        if fbank.numel() == 0:
            return self._empty_audio_tensor()

        return self._split_into_frames(fbank)

    @classmethod
    def from_config(cls, cfg=None):
        """Build the processor from ``cfg``; missing keys use the defaults below."""
        if cfg is None:
            cfg = OmegaConf.create()

        return cls(
            model_name=cfg.get("model_name", 'iter3'),
            sampling_rate=cfg.get("sampling_rate", 16000),
            n_frames=cfg.get("n_frames", 2),
            frame_length=cfg.get("frame_length", 512),
            is_eval=cfg.get("is_eval", False)
        )
@registry.register_processor("blip_diffusion_inp_image_train")
@registry.register_processor("blip_diffusion_inp_image_eval")
class BlipDiffusionInputImageProcessor(BlipImageBaseProcessor):
    """Resize, center-crop, tensorise and normalise an input image."""

    def __init__(
        self,
        image_size=224,
        mean=None,
        std=None,
    ):
        super().__init__(mean=mean, std=std)

        steps = [
            transforms.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            self.normalize,
        ]
        self.transform = transforms.Compose(steps)

    def __call__(self, item):
        """Apply the composed transform to ``item``."""
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from ``cfg`` keys ``image_size`` (224), ``mean`` and ``std``."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            image_size=cfg.get("image_size", 224),
            mean=cfg.get("mean", None),
            std=cfg.get("std", None),
        )


@registry.register_processor("blip_diffusion_tgt_image_train")
class BlipDiffusionTargetImageProcessor(BaseProcessor):
    """Resize, center-crop, tensorise and normalise a target image with mean/std 0.5."""

    def __init__(
        self,
        image_size=512,
    ):
        super().__init__()

        steps = [
            transforms.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
        self.transform = transforms.Compose(steps)

    def __call__(self, item):
        """Apply the composed transform to ``item``."""
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from the ``cfg`` key ``image_size`` (512)."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(image_size=cfg.get("image_size", 512))
class BlipImageBaseProcessor(BaseProcessor):
    """Base for the image processors; owns the ``normalize`` transform."""

    def __init__(self, mean=None, std=None):
        mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean
        std = (0.26862954, 0.26130258, 0.27577711) if std is None else std

        self.normalize = transforms.Normalize(mean, std)


@registry.register_processor("blip_caption")
class BlipCaptionProcessor(BaseProcessor):
    """Clean a caption, truncate it to ``max_words`` and prepend ``prompt``."""

    def __init__(self, prompt="", max_words=50):
        self.prompt = prompt
        self.max_words = max_words

    def __call__(self, caption):
        return self.prompt + self.pre_caption(caption)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from ``cfg`` keys ``prompt`` ("") and ``max_words`` (50)."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            prompt=cfg.get("prompt", ""),
            max_words=cfg.get("max_words", 50),
        )

    def pre_caption(self, caption):
        """
        Lower-case ``caption``, replace the listed punctuation with spaces,
        collapse whitespace runs, trim it, and keep at most ``max_words``
        space-separated words.
        """
        text = re.sub(
            r"([.!\"()*#:;~])",
            " ",
            caption.lower(),
        )
        text = re.sub(
            r"\s{2,}",
            " ",
            text,
        )
        text = text.rstrip("\n").strip(" ")

        # truncate caption
        words = text.split(" ")
        if len(words) > self.max_words:
            text = " ".join(words[: self.max_words])

        return text
@registry.register_processor("blip_question")
class BlipQuestionProcessor(BaseProcessor):
    """Clean a question string and truncate it to ``max_words`` words."""

    def __init__(self, max_words=50):
        self.max_words = max_words

    def __call__(self, question):
        return self.pre_question(question)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from the ``cfg`` key ``max_words`` (50)."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(max_words=cfg.get("max_words", 50))

    def pre_question(self, question):
        """
        Lower-case ``question``, delete the listed punctuation, strip trailing
        spaces and keep at most ``max_words`` space-separated words.
        """
        text = re.sub(
            r"([.!\"()*#:;~])",
            "",
            question.lower(),
        )
        text = text.rstrip(" ")

        # truncate question
        words = text.split(" ")
        if len(words) > self.max_words:
            text = " ".join(words[: self.max_words])

        return text


@registry.register_processor("blip_image_train")
class BlipImageTrainProcessor(BlipImageBaseProcessor):
    """Training image pipeline: random crop, flip, RandomAugment, tensor, normalise."""

    def __init__(
        self, image_size=384, mean=None, std=None, min_scale=0.5, max_scale=1.0
    ):
        super().__init__(mean=mean, std=std)

        augmentations = [
            "Identity",
            "AutoContrast",
            "Brightness",
            "Sharpness",
            "Equalize",
            "ShearX",
            "ShearY",
            "TranslateX",
            "TranslateY",
            "Rotate",
        ]
        steps = [
            transforms.RandomResizedCrop(
                image_size,
                scale=(min_scale, max_scale),
                interpolation=InterpolationMode.BICUBIC,
            ),
            transforms.RandomHorizontalFlip(),
            RandomAugment(2, 5, isPIL=True, augs=augmentations),
            transforms.ToTensor(),
            self.normalize,
        ]
        self.transform = transforms.Compose(steps)

    def __call__(self, item):
        """Apply the composed transform to ``item``."""
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from ``cfg``; defaults: image_size 384, scale range 0.5-1.0."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            image_size=cfg.get("image_size", 384),
            mean=cfg.get("mean", None),
            std=cfg.get("std", None),
            min_scale=cfg.get("min_scale", 0.5),
            max_scale=cfg.get("max_scale", 1.0),
        )


@registry.register_processor("blip_image_eval")
class BlipImageEvalProcessor(BlipImageBaseProcessor):
    """Evaluation image pipeline: fixed square resize, tensor, normalise."""

    def __init__(self, image_size=384, mean=None, std=None):
        super().__init__(mean=mean, std=std)

        steps = [
            transforms.Resize(
                (image_size, image_size), interpolation=InterpolationMode.BICUBIC
            ),
            transforms.ToTensor(),
            self.normalize,
        ]
        self.transform = transforms.Compose(steps)

    def __call__(self, item):
        """Apply the composed transform to ``item``."""
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from ``cfg`` keys ``image_size`` (384), ``mean`` and ``std``."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            image_size=cfg.get("image_size", 384),
            mean=cfg.get("mean", None),
            std=cfg.get("std", None),
        )


@registry.register_processor("blip2_image_train")
class Blip2ImageTrainProcessor(BlipImageBaseProcessor):
    """BLIP-2 training image pipeline: random crop, flip, tensor, normalise."""

    def __init__(
        self, image_size=364, mean=None, std=None, min_scale=0.5, max_scale=1.0
    ):
        super().__init__(mean=mean, std=std)

        steps = [
            transforms.RandomResizedCrop(
                image_size,
                scale=(min_scale, max_scale),
                interpolation=InterpolationMode.BICUBIC,
            ),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            self.normalize,
        ]
        self.transform = transforms.Compose(steps)

    def __call__(self, item):
        """Apply the composed transform to ``item``."""
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from ``cfg``; defaults: image_size 364, scale range 0.5-1.0."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            image_size=cfg.get("image_size", 364),
            mean=cfg.get("mean", None),
            std=cfg.get("std", None),
            min_scale=cfg.get("min_scale", 0.5),
            max_scale=cfg.get("max_scale", 1.0),
        )
def _convert_to_rgb(image):
    """Return ``image.convert("RGB")``."""
    rgb_image = image.convert("RGB")
    return rgb_image


@registry.register_processor("clip_image_train")
class ClipImageTrainProcessor(BlipImageBaseProcessor):
    """CLIP training image pipeline: random crop, RGB conversion, tensor, normalise."""

    def __init__(
        self, image_size=224, mean=None, std=None, min_scale=0.9, max_scale=1.0
    ):
        super().__init__(mean=mean, std=std)

        steps = [
            transforms.RandomResizedCrop(
                image_size,
                scale=(min_scale, max_scale),
                interpolation=InterpolationMode.BICUBIC,
            ),
            _convert_to_rgb,
            transforms.ToTensor(),
            self.normalize,
        ]
        self.transform = transforms.Compose(steps)

    @classmethod
    def from_config(cls, cfg=None):
        """Build from ``cfg``; defaults: image_size 224, scale range 0.9-1.0."""
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            image_size=cfg.get("image_size", 224),
            mean=cfg.get("mean", None),
            std=cfg.get("std", None),
            min_scale=cfg.get("min_scale", 0.9),
            max_scale=cfg.get("max_scale", 1.0),
        )
# torch.nn.functional.interpolate only accepts `align_corners` for these modes.
_ALIGN_CORNERS_MODES = ("linear", "bilinear", "bicubic", "trilinear")


def _is_tensor_video_clip(clip):
    """
    Validate that ``clip`` is a 4D tensor.

    Returns:
        bool: always True when validation passes.
    Raises:
        TypeError: if ``clip`` is not a tensor.
        ValueError: if ``clip`` is not 4-dimensional.
    """
    if not torch.is_tensor(clip):
        raise TypeError("clip should be Tensor. Got %s" % type(clip))

    if not clip.ndimension() == 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())

    return True


def crop(clip, i, j, h, w):
    """
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        i (int), j (int): coordinates of the upper left corner.
        h (int), w (int): height and width of the cropped region.
    Returns:
        torch.tensor: view of ``clip`` restricted to the region on the last
        two dims.
    """
    if len(clip.size()) != 4:
        raise ValueError("clip should be a 4D tensor")
    return clip[..., i : i + h, j : j + w]


def resize(clip, target_size, interpolation_mode):
    """
    Resize the last two dims of ``clip`` to ``target_size``.

    Args:
        clip (torch.tensor): Size is (C, T, H, W)
        target_size (tuple(int, int)): height and width of the result.
        interpolation_mode (str): mode passed to
            ``torch.nn.functional.interpolate``.
    Returns:
        torch.tensor: resized clip.
    Raises:
        ValueError: if ``target_size`` does not have exactly two entries.
    """
    if len(target_size) != 2:
        raise ValueError(
            f"target size should be tuple (height, width), instead got {target_size}"
        )
    # Passing align_corners with e.g. "nearest" or "area" makes interpolate
    # raise, so it is only supplied for the modes that support it.
    extra = (
        {"align_corners": False}
        if interpolation_mode in _ALIGN_CORNERS_MODES
        else {}
    )
    return torch.nn.functional.interpolate(
        clip, size=target_size, mode=interpolation_mode, **extra
    )


def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Do spatial cropping and resizing to the video clip
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        i (int): i in (i,j) i.e coordinates of the upper left corner.
        j (int): j in (i,j) i.e coordinates of the upper left corner.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): height and width of resized clip
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W)
    """
    _is_tensor_video_clip(clip)  # raises on invalid input
    clip = crop(clip, i, j, h, w)
    clip = resize(clip, size, interpolation_mode)
    return clip


def center_crop(clip, crop_size):
    """
    Crop the central ``crop_size`` region of the last two dims.

    Args:
        clip (torch.tensor): Size is (C, T, H, W)
        crop_size (tuple(int, int)): target height and width.
    Raises:
        ValueError: if the clip is smaller than ``crop_size`` in either dim.
    """
    _is_tensor_video_clip(clip)  # raises on invalid input
    h, w = clip.size(-2), clip.size(-1)
    th, tw = crop_size
    if h < th or w < tw:
        raise ValueError("height and width must be no smaller than crop_size")

    i = int(round((h - th) / 2.0))
    j = int(round((w - tw) / 2.0))
    return crop(clip, i, j, th, tw)


def to_tensor(clip):
    """
    Convert tensor data type from uint8 to float, divide value by 255.0 and
    permute the dimensions of clip tensor
    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
    Raises:
        TypeError: if ``clip`` is not a uint8 tensor.
    """
    _is_tensor_video_clip(clip)
    if not clip.dtype == torch.uint8:
        raise TypeError(
            "clip tensor should have data type uint8. Got %s" % str(clip.dtype)
        )
    return clip.float().permute(3, 0, 1, 2) / 255.0


def normalize(clip, mean, std, inplace=False):
    """
    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
        inplace (bool): when False (default) a clone is normalized and the
            input is left untouched.
    Returns:
        normalized clip (torch.tensor): Size is (C, T, H, W)
    """
    _is_tensor_video_clip(clip)  # raises on invalid input
    if not inplace:
        clip = clip.clone()
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    # broadcast the per-channel statistics over (T, H, W)
    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
    return clip


def hflip(clip):
    """
    Args:
        clip (torch.tensor): Video clip to be flipped. Size is (C, T, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (C, T, H, W), reversed along the
        last dim.
    """
    _is_tensor_video_clip(clip)  # raises on invalid input
    return clip.flip(-1)